In [1]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain ragas datasets cohere

In [2]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import pandas as pd
import numpy as np
import sys
sys.tracebacklimit = 0
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

# Evaluation part

### Ground truth

In [4]:
questions = [
    "What is RAG?",
    "In the framework for implementing generative AI services using the Retrieval-Augmented Generation (RAG) model, what specific open-source products are chosen for vectorized databases?",
    "What does CRUD stand for when it comes to RAG?",
    "What evaluation metrics can be used to evaluate RAG systems?",
    "What are the four fundamental abilities required for Retrieval-Augmented Generation (RAG) according to Chen et al., and how are they evaluated in the Retrieval-Augmented Generation Benchmark (RGB)?",
    "What method does the Thulke et al. propose to address the computational complexity issue in the knowledge selection task, and how does it achieve a significant reduction in computation time?",
    "How does the proposed system by Matei Bucur address the limitations related to the availability of relevant external documents for form filling in the broader context?",
    "What embedding methods can be used for RAG systems?",
    "According to the experimental results by Wu et al., how much did the second strategy, based on the absence of detected hallucination spans, reduce the hallucination rate for the GPT-3.5-Turbo and GPT-4 models in comparison to random selection?",
    "What are some of the best retrieval strategies in RAG systems?"
]

ground_truths = [
    ["RAG stands for Retrieval Augmented Generation. It is a technique developed to address the challenge of requiring external data for generating more cohesive answers. The approach involves fetching and incorporating information into the prompt, allowing the model to generate responses on subjects and data not encountered during training. This aims to reduce the occurrence of hallucinations in the model's outputs."],
    ["Chroma DB and FAISS."],
    ["In 'CREATE', the system improves the input text by adding relevant information from external sources, making creative outputs such as poetry, stories, or code. In 'READ', the system uses external knowledge retrieval to answer questions, solving problems in question-answering, dialogue, and reasoning, and increasing understanding of the input text. In 'UPDATE', the system fixes errors in the input text using retrieved content, correcting spelling, grammar or factual errors to make the text better. In 'DELETE', the system simplifies the input by improving retrieval results, removing unnecessary details, and doing tasks like text summarization or simplification"],
    ["Accuracy, Rejection rate, Error detection rate, Baseline model Reflexion, Precision of reasoning, Multi-hop reasoning accuracy, Mean Recriprocal Rank (MRR@k), Recall@k, Relative Maximum Score, Mean Average Precision at K (MAP@K), Mean Reciprocal Rank at K (MRR@K),  Hit Rate at K (Hit@K), Exact Match (EM), F1 score, Precision,  Recall, perplexity (PPL), GLEU, BLEU, ROUGE, METEOR,  BERTScore, EM score (Exact Match score), RAGQuestEval, Faithfulness, Recall@1 (R@1), BLEU-1, METEOR, ROUGE-L"],
    ["The four fundamental abilities required for RAG, as mentioned in the paper, are Noise Robustness, Negative Rejection, Information Integration, and Counterfactual Robustness. These abilities are evaluated in RGB through four separate testbeds, each focusing on one of these abilities. The testbeds are constructed based on the common challenges in RAG and include instances that assess the performance of large language models in terms of noise robustness, negative rejection, information integration, and counterfactual robustness."],
    ["They propose two methods to address the computational complexity issue in the knowledge selection task. The first method involves hierarchical selection, which splits the task into stages, reducing the computational cost significantly. The second method employs Dense Knowledge Retrieval, utilizing siamese sequence embedding networks for efficient document retrieval. This method achieves a more than 100x reduction in computation time compared to the baseline, albeit with a 5-6% degradation in R@1 compared to the hierarchical selection method."],
    ["The proposed system addresses the limitations related to the availability of relevant external documents for form filling in the broader context by encouraging users to upload all documents relevant to the specific form-filling task, such as FAQs, policies, guidelines, and rules. These documents are used to create a search index, and the retrieval mechanism leverages this index to generate form completions. Additionally, the system allows users to provide a brief description of the intent and purpose of the form, adding extra short-form documents to enhance the understanding of the task. This approach aims to ensure that the system has access to the necessary information, making it more adaptable to different form-filling scenarios."],
    ["Custom ADA-002, BM25, Customization techniques involving adaptations like Multiple Negative Ranking Loss and Mean Squared Error (MSE) Loss, text-embedding-ada-002, text-search-ada-query-001, llm-embedder, bge-large-en-v1.5, jina-embeddings-v2-base-en, e5-base-v2,  instructor-large, voyage-02."],
    ["The second strategy, based on the absence of detected hallucination spans, achieved a reduction in hallucination rates of 52.9% for GPT-3.5-Turbo and 41% for GPT-4 models, compared to random selection"],
    ["Hybrid search, reranker, multi-time retrieval approach, vector search, BM25, hierarchical selection, prompt engineering."]
]
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [5]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [6]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [25]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [8]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [9]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [10]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [11]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [12]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answers by LLM

In [13]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [14]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [16]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

[[''], [''], [''], [''], [''], [''], [''], [''], [''], ['']]
<class 'list'>


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.98it/s]


The average score is: 0.6381804570446418
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5           1.0          0.581812                NaN             NaN   

   Answer Similarity  Answer Correctness  Average  
0           0.863942            0.483329  0.63818  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

[[''], [''], [''], [''], [''], [''], [''], [''], [''], ['']]
<class 'list'>


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:16<00:00,  3.73it/s]


The average score is: 0.6302111460020389
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4           1.0          0.567018                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0            0.86454            0.449709  0.630211  


In [18]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.581812,,,0.863942,0.483329,0.63818
1,GPT-4,1.0,0.567018,,,0.86454,0.449709,0.630211


### Naive RAG

In [19]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [20]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [21]:
loader = DirectoryLoader('../papers', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [22]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../papers/vectordb-edit/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [26]:
db_naive = Chroma(persist_directory = "../papers/vectordb-edit/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [23]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [24]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

<class 'list'>


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  25%|██▌       | 15/60 [00:04<00:10,  4.32it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:06<00:00,  8.84it/s]


The average score is: 0.8899139161099786
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.629361          0.966667           0.932998        0.963889   

   Answer Similarity  Answer Correctness   Average  
0           0.963462            0.883107  0.889914  


## try recursive text splitter

In [26]:
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_basic")
retriever_basic = db_basic.as_retriever()

In [27]:
db_basic.persist()

In [None]:
db_basic = Chroma(persist_directory = "../papers/vectordb-edit/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [28]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [29]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  28%|██▊       | 17/60 [00:04<00:09,  4.70it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.45it/s]


The average score is: 0.8923639412147183
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.626769,0.977778,0.939179,0.963889,0.963462,0.883107,0.892364


In [30]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.581812,,,0.863942,0.483329,0.63818
1,GPT-4,1.0,0.567018,,,0.86454,0.449709,0.630211
2,Naive,0.629361,0.966667,0.932998,0.963889,0.963462,0.883107,0.889914
3,Recursive,0.626769,0.977778,0.939179,0.963889,0.963462,0.883107,0.892364


## chunk size change

In [32]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [35]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_1000")
# db_1000.persist()

55


In [None]:
# db_1000 = Chroma(persist_directory = "../papers/vectordb-edit/recursive_1000", embedding_function=embeddings_client)

In [39]:
# retriever_1000 = db_1000.as_retriever()
# result_1000_0 = run_and_evaluate("Chunk 1000, overlap 0%",retriever_1000, prompt, llm, results_df)
# print("CHUNK SIZE 1000, 0% overlap")
# result_1000_0

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.55it/s]


The average score is: 0.7505457735075017
This is the new best value!
CHUNK SIZE 1000, 0% overlap


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%",0.716667,0.599047,0.9,0.98,0.847668,0.459893,0.750546


In [34]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # Split documents
        chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%: {len(chunks)}")
        
        # Create Chroma database
        db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../papers/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result, results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 5%: 660


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  17%|█▋        | 10/60 [00:03<00:12,  3.87it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.92it/s]


The average score is: 0.8995507723110836
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.976053           0.97992           0.940768   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.899326            0.601237  0.899551  
Number of chunks for chunk size 500, overlap 10%: 679


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.47it/s]


The average score is: 0.8313719761373116
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%           1.0          0.861279           0.872222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.78           0.891491             0.58324  0.831372  
Number of chunks for chunk size 500, overlap 15%: 695


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.49it/s]


The average score is: 0.8941619865386087
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.962963          0.944893           0.941667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.901248            0.714201  0.894162  
Number of chunks for chunk size 500, overlap 20%: 712


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-750' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-751' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-752' coro=<AsyncClient.aclose() done, defined

The average score is: 0.8984910245400295
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.972222          0.958276              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.903441            0.682007  0.898491  
Number of chunks for chunk size 1000, overlap 5%: 380


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:05<00:00, 10.19it/s]


The average score is: 0.9004659999164497
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%           1.0          0.954163           0.963889   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.903903            0.680841  0.900466  
Number of chunks for chunk size 1000, overlap 10%: 384


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.82it/s]


The average score is: 0.9178452285965846
This is the new best value!
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%           1.0          0.944514           0.936111   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.905397            0.721049  0.917845  
Number of chunks for chunk size 1000, overlap 15%: 385


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.66it/s]


The average score is: 0.9158128837545355
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%           1.0          0.959255              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.954762           0.903965            0.701894  0.915813  
Number of chunks for chunk size 1000, overlap 20%: 388


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:05<00:00, 10.28it/s]


The average score is: 0.8744523443241207
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%           0.9          0.926928           0.938889   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.925           0.903688             0.65221  0.874452  
Number of chunks for chunk size 2000, overlap 5%: 253


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:04<00:14,  3.30it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.35it/s]


The average score is: 0.8983840200557323
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%      0.619888               1.0           0.936237   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.963889           0.988462            0.881828  0.898384  
Number of chunks for chunk size 2000, overlap 10%: 253


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  17%|█▋        | 10/60 [00:04<00:20,  2.39it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:06<00:00,  9.09it/s]


The average score is: 0.9031998186367467
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%      0.660727               1.0           0.923042   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.963889           0.988462            0.883079   0.9032  
Number of chunks for chunk size 2000, overlap 15%: 253


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   8%|▊         | 5/60 [00:04<00:37,  1.49it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.43it/s]


The average score is: 0.8959771345180569
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.648617               1.0           0.930926   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.923889           0.988462            0.883969  0.895977  
Number of chunks for chunk size 2000, overlap 20%: 253


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  20%|██        | 12/60 [00:04<00:13,  3.57it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  92%|█████████▏| 55/60 [00:05<00:00, 16.71it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.43it/s]


The average score is: 0.8873481963404403
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.615365               1.0            0.92364   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.915556           0.988462            0.881066  0.887348  
Number of chunks for chunk size 3000, overlap 5%: 252


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  32%|███▏      | 19/60 [00:04<00:07,  5.72it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.07it/s]


The average score is: 0.8906974455207269
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.609161               1.0           0.926964   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.963889           0.963462            0.880709  0.890697  
Number of chunks for chunk size 3000, overlap 10%: 252


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  32%|███▏      | 19/60 [00:04<00:08,  4.68it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.49it/s]


The average score is: 0.8888140932572158
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.598154               1.0           0.926966   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.963889           0.963462            0.880414  0.888814  
Number of chunks for chunk size 3000, overlap 15%: 252


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:04<00:15,  2.92it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:06<00:00,  8.68it/s]


The average score is: 0.8934493967791844
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%       0.62451               1.0           0.927038   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.963889           0.963462            0.881798  0.893449  
Number of chunks for chunk size 3000, overlap 20%: 252


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  30%|███       | 18/60 [00:04<00:08,  5.22it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.20it/s]


The average score is: 0.8989719714549104
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.629029               1.0           0.930217   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.963889           0.988462            0.882235  0.898972  


In [None]:
results_df.to_csv(f"../papers/results/results.csv", index=False)

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 50)
# chunks_1000_5 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000_5 = Chroma.from_documents(chunks_1000_5, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_1000_5")
# db_1000_5.persist()

In [None]:
# retriever_1000_5 = db_1000_5.as_retriever()
# result_1000_5 = run_and_evaluate("Chunk 1000, overlap 0%",retriever_1000_5, prompt, llm, results_df)
# print("CHUNK SIZE 1000, 5% overlap")
# result_1000_5

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_500")
# retriever_500 = db_500.as_retriever()
# result_500 = change_chunk_size(retriever_500)
# print("CHUNK SIZE 500")
# print(result_500)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_2000")
# retriever_2000 = db_2000.as_retriever()
# result_2000 = change_chunk_size(retriever_2000)
# print("CHUNK SIZE 2000")
# print(result_2000)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../papers/vectordb-edit/recursive_3000")
# retriever_3000 = db_3000.as_retriever()
# result_3000 = change_chunk_size(retriever_3000)
# print("CHUNK SIZE 3000")
# print(result_3000)

In [35]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.581812,,,0.863942,0.483329,0.63818
1,GPT-4,1.0,0.567018,,,0.86454,0.449709,0.630211
2,Naive,0.629361,0.966667,0.932998,0.963889,0.963462,0.883107,0.889914
3,Recursive,0.626769,0.977778,0.939179,0.963889,0.963462,0.883107,0.892364
4,"Chunk 500, overlap 0%",1.0,0.775448,0.894444,0.75,0.888674,0.566091,0.812443
5,"Chunk 1000, overlap 0%",1.0,0.956411,0.983333,0.96,0.905858,0.642752,0.908059
6,"Chunk 2000, overlap 0%",0.6323,1.0,0.936453,0.963889,0.988462,0.883719,0.900804
7,"Chunk 3000, overlap 0%",0.630187,1.0,0.92645,0.963889,0.963462,0.882235,0.89437
8,"Chunk 500, overlap 5%",0.976053,0.97992,0.940768,1.0,0.899326,0.601237,0.899551
9,"Chunk 500, overlap 10%",1.0,0.861279,0.872222,0.78,0.891491,0.58324,0.831372


In [36]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
13,"Chunk 1000, overlap 10%",1.0,0.944514,0.936111,1.0,0.905397,0.721049,0.917845


In [37]:
results_df.to_csv(f"../papers/results/results.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [18]:
results_df = pd.read_csv(f"../papers/results/results.csv")

In [38]:
db_k = Chroma(persist_directory=f"../papers/vectordb-edit/chunking_1000_10", embedding_function=embeddings_client)

In [39]:
k_values = [2, 3, 5, 6, 7]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result, results_df = run_and_evaluate(f"Chunk size 1000, overlap 10%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.03it/s]


The average score is: 0.8595588343265853
Results for K=2:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 10%, K=2      0.868333          0.957011   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.757143           0.900639            0.674227   

    Average  
0  0.859559  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.69it/s]


The average score is: 0.8663941376727795
Results for K=3:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 10%, K=3      0.946667          0.932556   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.966667        0.807143           0.900012            0.645321   

    Average  
0  0.866394  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1691' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1692' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1693' coro=<AsyncClient.aclose() done, defi

The average score is: 0.892436032893865
Results for K=5:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 10%, K=5      0.935185          0.921362   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.903611             1.0           0.907359            0.687099   

    Average  
0  0.892436  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:04<00:16,  2.81it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.62it/s]


The average score is: 0.8723694090203784
Results for K=6:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 10%, K=6      0.738083          0.951749   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.888203        0.953667           0.866972            0.835543   

    Average  
0  0.872369  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.41it/s]


The average score is: 0.8658537789971031
Results for K=7:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 10%, K=7      0.925926          0.892252   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.901504             0.9           0.902719            0.672722   

    Average  
0  0.865854  


In [35]:
results_df.to_csv(f"../papers/results/results.csv", index=False)

In [28]:
# retriever_2 = db.as_retriever(search_kwargs={"k": 2})
# result_2 = run_and_evaluate("K=2",retriever_2, prompt, llm, results_df)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.88it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 1.0000, 'answer_relevancy': 0.7317, 'context_precision': 0.9000, 'context_recall': 1.0000, 'answer_similarity': 0.9071, 'answer_correctness': 0.5908}


In [29]:
# retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
# result_3 = change_chunk_size(retriever_3)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.33it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 1.0000, 'answer_relevancy': 0.7832, 'context_precision': 0.9333, 'context_recall': 0.9250, 'answer_similarity': 0.9326, 'answer_correctness': 0.6117}


In [30]:
# retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
# result_5 = change_chunk_size(retriever_5)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.03it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.9028, 'answer_relevancy': 0.6788, 'context_precision': 0.9333, 'context_recall': 1.0000, 'answer_similarity': 0.9250, 'answer_correctness': 0.6024}


### look for different retrievers

4 chunks was the best score

#### parent document retriever

In [40]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../papers/vectordb-edit/parent-qa", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.42it/s]


The average score is: 0.8579152257448598
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.949074           0.83992   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.972222            0.85           0.900484            0.635791   

    Average  
0  0.857915  


In [41]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../papers/vectordb-edit/parent_small-qa", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.33it/s]


The average score is: 0.8874619132516387
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100           1.0          0.943953   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.913889           0.925           0.899494            0.642435   

    Average  
0  0.887462  


In [42]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../papers/vectordb-edit/parent_large-qa", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_large, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  33%|███▎      | 20/60 [00:04<00:08,  4.87it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.75it/s]


The average score is: 0.8426689968348454
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200      0.653428           0.94709   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.829265         0.90119           0.885671            0.839369   

    Average  
0  0.842669  


In [43]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
13,"Chunk 1000, overlap 10%",1.0,0.944514,0.936111,1.0,0.905397,0.721049,0.917845


#### Maximum marginal relevance retrieval

In [44]:
retriever_mmr = db_k.as_retriever(search_type="mmr")
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:01<01:43,  1.75s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.64it/s]


The average score is: 0.838339932479549
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.905192          0.869071           0.889952            0.85   

   Answer Similarity  Answer Correctness  Average  
0             0.9013            0.614524  0.83834  


#### BM25

In [47]:
from langchain.retrievers import BM25Retriever
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000_100 = text_splitter.split_documents(documents)
retriever_bm25 = BM25Retriever.from_documents(chunks_1000_100)
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:01<01:43,  1.75s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.84it/s]


The average score is: 0.8170132577714905
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25      0.948826          0.829658           0.775303        0.854762   

   Answer Similarity  Answer Correctness   Average  
0           0.896502            0.597029  0.817013  


#### Ensambler - Hybrid

In [48]:
from langchain.retrievers import EnsembleRetriever
ret = db_k.as_retriever()
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.94it/s]


The average score is: 0.8564256858539333
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1           1.0          0.887284           0.774095   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.894385            0.582789  0.856426  


In [49]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.73it/s]


The average score is: 0.8558923090987111
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2         0.975          0.823256           0.828897   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.966667            0.89855            0.642985  0.855892  


In [50]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.62it/s]


The average score is: 0.8530856747444694
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3      0.885714          0.815345           0.915738   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.899274            0.702443  0.853086  


In [51]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.581812,,,0.863942,0.483329,0.63818
1,GPT-4,1.0,0.567018,,,0.86454,0.449709,0.630211
2,Naive,0.629361,0.966667,0.932998,0.963889,0.963462,0.883107,0.889914
3,Recursive,0.626769,0.977778,0.939179,0.963889,0.963462,0.883107,0.892364
4,"Chunk 500, overlap 0%",1.0,0.775448,0.894444,0.75,0.888674,0.566091,0.812443
5,"Chunk 1000, overlap 0%",1.0,0.956411,0.983333,0.96,0.905858,0.642752,0.908059
6,"Chunk 2000, overlap 0%",0.6323,1.0,0.936453,0.963889,0.988462,0.883719,0.900804
7,"Chunk 3000, overlap 0%",0.630187,1.0,0.92645,0.963889,0.963462,0.882235,0.89437
8,"Chunk 500, overlap 5%",0.976053,0.97992,0.940768,1.0,0.899326,0.601237,0.899551
9,"Chunk 500, overlap 10%",1.0,0.861279,0.872222,0.78,0.891491,0.58324,0.831372


#### Multi-stage - reranker

In [46]:
%pip install --upgrade --quiet  cohere

Note: you may need to restart the kernel to use updated packages.


In [47]:
import getpass

os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [67]:
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

retriever_context = ret
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=retriever_context
)

result_compression, results_df = run_and_evaluate(f"Cohere reranker", compression_retriever, prompt, llm, results_df)
print("Reranker")
print(result_compression)



TypeError: BaseCohere.rerank() takes 1 positional argument but 4 positional arguments (and 2 keyword-only arguments) were given

#### creating context by remaking the query

In [52]:
highest = results_df.nlargest(1, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
13,"Chunk 1000, overlap 10%",1.0,0.944514,0.936111,1.0,0.905397,0.721049,0.917845


In [53]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [54]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = ret.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.52it/s]


{'faithfulness': 1.0000, 'answer_relevancy': 0.9439, 'context_precision': 0.8306, 'context_recall': 0.8000, 'answer_similarity': 0.9013, 'answer_correctness': 0.6398}

In [55]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.8525949606908769


In [56]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.581812,,,0.863942,0.483329,0.63818
1,GPT-4,1.0,0.567018,,,0.86454,0.449709,0.630211
2,Naive,0.629361,0.966667,0.932998,0.963889,0.963462,0.883107,0.889914
3,Recursive,0.626769,0.977778,0.939179,0.963889,0.963462,0.883107,0.892364
4,"Chunk 500, overlap 0%",1.0,0.775448,0.894444,0.75,0.888674,0.566091,0.812443
5,"Chunk 1000, overlap 0%",1.0,0.956411,0.983333,0.96,0.905858,0.642752,0.908059
6,"Chunk 2000, overlap 0%",0.6323,1.0,0.936453,0.963889,0.988462,0.883719,0.900804
7,"Chunk 3000, overlap 0%",0.630187,1.0,0.92645,0.963889,0.963462,0.882235,0.89437
8,"Chunk 500, overlap 5%",0.976053,0.97992,0.940768,1.0,0.899326,0.601237,0.899551
9,"Chunk 500, overlap 10%",1.0,0.861279,0.872222,0.78,0.891491,0.58324,0.831372


### change model to GPT-4

In [57]:
highest = results_df.nlargest(3, "Average")
highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
13,"Chunk 1000, overlap 10%",1.0,0.944514,0.936111,1.0,0.905397,0.721049,0.917845
14,"Chunk 1000, overlap 15%",1.0,0.959255,0.975,0.954762,0.903965,0.701894,0.915813
5,"Chunk 1000, overlap 0%",1.0,0.956411,0.983333,0.96,0.905858,0.642752,0.908059


In [59]:
result_gpt4_1000_10, results_df = run_and_evaluate(f"Chunk 1000, overlap 10%, GPT-4", ret, prompt, llm_gpt4, results_df)
result_gpt4_1000_10

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.85it/s]


The average score is: 0.8902044941248054


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 10%, GPT-4",0.9,0.944886,0.944444,1.0,0.904391,0.647505,0.890204


In [60]:
db_15 = Chroma(persist_directory = "../papers/vectordb-edit/chunking_1000_15", embedding_function=embeddings_client)
ret_15 = db_15.as_retriever()
result_search_query_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 15%, GPT-4", ret_15, prompt, llm_gpt4, results_df)
result_search_query_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.32it/s]


The average score is: 0.8921944588786568


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 15%, GPT-4",0.888889,0.949334,0.963889,0.969048,0.903324,0.678684,0.892194


In [61]:
db_0 = Chroma(persist_directory = "../papers/vectordb-edit/chunking_1000_0", embedding_function=embeddings_client)
ret_0 = db_0.as_retriever()
result_search_query_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 0%, GPT-4", ret_0, prompt, llm_gpt4, results_df)
result_search_query_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.57it/s]


The average score is: 0.9124498020107327


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",1.0,0.949857,0.991667,0.95,0.902345,0.68083,0.91245


In [62]:
results_df.to_csv(f"../papers/results/results_qa.csv", index=False)