In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import pandas as pd
import numpy as np
import sys
sys.tracebacklimit = 0
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [7]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [8]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [9]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [10]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

In [11]:
questions = [
    "Character analysis of Sejanus Plinth",
    "Character analysis of Lucy Gray",
    "Character analysis of Tigris Snow",
    "What are the places 1-10 in the 10th Hunger Games?",
    "How does Lucy Gray win the 10th Hunger games?",
    "How and Who Came Up with the Hunger Games?",
    "Why did Snow join the mentorship program in The Ballad of Songbirds and Snake?",
    "What idea does Snow present to the Head Game Maker Volumnia Gaul?",
    "What evidence does Snow secretly record and send to Dr. Gaul, and what is the result of that?",
    "How do Snow and Lucy gain popularity in the Capitol?"
]


ground_truths = [
    ["Sejanus is one of Coriolanus’s classmates and a mentor in the Hunger Games. Coriolanus and many of his classmates treat Sejanus coldly, as Sejanus isn’t Capitol-born. Rather, his father, a munitions magnate from District Two, bought his way into Capitol high society. Sejanus resents his father and hates the Capitol; like his mother, Ma, he still feels connected to District Two and considers the district home. Coriolanus gets drawn into Sejanus’s orbit when, during the reaping, Sejanus shares that his tribute from District Two, Marcus, is a former classmate. From this moment on, Coriolanus begins saving Sejanus from danger or embarrassment on many occasions. He takes on the role of Sejanus’s mentor, encouraging Sejanus to remain loyal to the Capitol and not step too far out of line in protesting the Games. However, Sejanus remains convinced that the Games are wrong, and he goes so far as to enter the arena on the first night. He expects the tributes to kill him and believes this will make a statement to the Capitol that the Games are inhumane. Sejanus ultimately agrees to leave the arena with Coriolanus and is unsuccessful in sending the message he intended. His actions, however, get him in trouble with the Capitol. Though Sejanus resents his father’s money, Strabo buys Sejanus’s way out of trouble and allows him to join the Peacekeepers. Sejanus goes into the Peacekeepers with characteristic optimism—he wants to train as a medic and help people. But when Sejanus learns he can’t be a medic if there’s no war, he instead falls in with rebels in District 12, helps plot an escape, and purchases guns for the rebel forces. He outright ignores Coriolanus’s attempts to keep him out of trouble—but Coriolanus does end up double-crossing Sejanus. He records Sejanus’s admission that he’s helping the rebels with a jabberjay, and Dr. Gaul eventually hears the message. Sejanus is ultimately executed for treason."],
    ["Lucy Gray is the female tribute from District 12. Lucy Gray proves herself to be a cutthroat performer when, at the reaping, she slips a snake down a girl’s dress and then sings a song onstage. She loves color—she usually wears a dress of rainbow ruffles—and often uses bird imagery when she speaks. As Coriolanus gets to know her, he learns that she’s an orphan. She’s a member of the Covey, a traveling band of musicians and performers that’s now permanently based in District 12. With this information, Coriolanus presents Lucy Gray as a tribute not from District 12, but as a person who’s more like people in the Capitol. This campaign is successful; Lucy Gray becomes the fan favorite. She and Coriolanus fall in love in the week before the Games, sharing a passionate kiss—and Coriolanus gives her his mother’s compact, both as a token of his affection and so she can sneak rat poison into the arena. Throughout the Games, Lucy Gray mostly stays hidden. She kills several people with rat poison and one person with one of the neon snakes Dr. Gaul drops in the arena. After she wins the Games, the Capitol sends her home, where she rejoins the Covey. She and Coriolanus attempt to keep their romance alive while Coriolanus is stationed there, but this becomes complicated. Many of Lucy Gray’s songs are about a former lover, Billy Taupe, whom she now hates but who is still in her life. However, when Lucy Gray starts to fear that the mayor (who incorrectly believes Lucy Gray killed his daughter) is going to hurt her, Lucy Gray and Coriolanus decide to run away. Lucy Gray prizes trust and friendship, so when she infers that Coriolanus is responsible for Sejanus’s death, she tries to run away. She sets a trap for Coriolanus that results in him being bitten by a snake. After this, Coriolanus shoots at her. He isn’t sure if he hits her, but he ultimately decides he doesn’t care. Though Lucy Gray mysteriously disappears after this, several songs she wrote persist—they appear 64 years later in the Hunger Games trilogy.Lucy Gray Baird Quotes in The Ballad of Songbir"],
    ["Tigris is Coriolanus’s cousin. She’s a few years older than Coriolanus and has been living with him and the Grandma’am since the war, when she was also orphaned. Tigris is kind, caring, and intelligent. Tigris has always taken it upon herself to look out for Coriolanus. This meant that she learned to cook as a young child during the war—but she also implies that at several points, she’s turned to sex work to make ends meet. Her dream has always been to work in fashion, which she’s doing in the novel’s present. Though she’s supposedly working as an apprentice, Coriolanus suggests that her employer treats her more as a grunt, making her do unsavory or dirty tasks rather than teaching her how to make clothes. Despite this, Tigris is resourceful and excels at upcycling old garments—she manages to transform one of Crassus Snow’s stained old shirts into a gorgeous, classy garment for Coriolanus to wear to the reaping. As the Hunger Games approach and begin, Tigris becomes increasingly skeptical of the Games and of Coriolanus’s involvement in them. Particularly as she develops sympathy for Lucy Gray—and later, after she realizes Dr. Gaul forced Coriolanus into the arena, putting him in danger—she expresses that the Games are wrong and not fair to any of the children involved, mentors or tributes."],
    ["1. Lucy Gray (victor), 2. Reaper, 3. Treech, 4. Teslee, 5. Mizzen, 6. Coral, 7. Circ, 8. Wovey, 9. Tanner, 10. Lamina"],
    ["Lucy Gray plays the long game once that year’s tournament begins. She spends the first day in hiding after running into the woods instead of picking from the weapons available, which ultimately leads to what is commonly known as the “blood bath.” She then uses water bottles, poison, and snakes to kill the other tributes, though she retains her humanity and comforts one of the tributes she kills by holding him and speaking to him softly. Lucy was, however, helped by her mentor from outside the arena—he slipped a handkerchief with Lucy’s scent into a tank of snakes that were later released into the Games. This meant that the snakes didn’t hunt Lucy, and she survived their arrival in the arena. In the end, after waiting it out and cleverly avoiding death, Lucy Gray is declared the winner of the 10th annual Hunger Games—a harrowing yet tremendous victory not only for her but Snow and her home, District 12, too."],
    ["One of the many tragic backstories in Ballad, Dean Casca Highbottom is the remorseful creator of the Hunger Games. He came up with the idea for a school project, but he was drunk and never meant for it to be put into practice. Unfortunately, his friend and partner for the project, Crassus Snow, submitted the idea to their teacher, Dr. Gaul, and, to Highbottom's horror, the Hunger Games were born. After his creation comes to life, Highbottom becomes a morphling addict."],
    ["Coriolanus Snow joined the mentorship program in 'The Ballad of Songbirds and Snakes' primarily motivated by his desire to restore his family's wealth and social standing. After losing both parents in the war, Snow's family becomes destitute, relying on food rations. Living in the Capitol with his grandmother and cousin Tigress, Snow sees the mentorship program as an opportunity to potentially win the cash prize and improve his family's dire situation. Despite initial reservations about mentoring the female tribute from District 12, the poorest district in Panem, Snow's motivation evolves as he develops a connection with Lucy Gray Baird. Over time, Snow's participation is influenced by his observations of the power dynamics and suffering within the Games, leading him to propose a betting and sponsorship program that shapes the future of the Hunger Games. The story unfolds as Snow's motivations become more complex, navigating personal ambition, family restoration, and a gradual realization of societal injustices within Panem."],
    ["Coriolanus Snow presents the idea of a betting and sponsorship program to the Head Game Maker Volumnia Gaul. Tasked with writing a paper about additions to the Hunger Games, Snow suggests this program to engage Capitol citizens in supporting their favorite tributes. He sees it as an opportunity to restore his family's wealth and secure his future. The concept adds a strategic layer to the Games, allowing tributes to attract sponsors by building alliances and performing well. Despite initial suspicions from Gaul, the idea proves successful, leading to increased sponsorships and support for Snow's tribute, Lucy Gray Baird. This idea becomes instrumental in the modern Hunger Games, marking a significant development in Snow's character as he navigates manipulation, moral dilemmas, and the power dynamics within Panem's society."],
    ["Coriolanus Snow secretly records and sends evidence to Dr. Gaul, the Head Game Maker, regarding Sejanus's plot to help District residents escape. Using a Jabberyjay, Snow captures Sejanus admitting to the plan and sends the message to Gaul. This information results in Sejanus's execution and the subsequent deaths of the Mayor's daughter and another citizen."],
    ["Coriolanus Snow and Lucy Gray Baird gain popularity in the Capitol through strategic actions and captivating performances. Despite their initial differences and the challenge of being paired together, Snow and Lucy find ways to captivate the citizens of the Capitol and become influential figures.One of the key strategies that Snow and Lucy employ to gain popularity is Lucy's talent as a singer and performer."]
    ]

### General answer by LLMs

In [12]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [14]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [16]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:23<00:00,  2.57it/s]


The average score is: 0.6636103441664355
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5           1.0          0.379673                NaN             NaN   

   Answer Similarity  Answer Correctness  Average  
0           0.841376            0.410613  0.66361  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.20it/s]


The average score is: 0.7423882546501969
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4           1.0          0.695899                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.900098            0.492424  0.742388  


# RAG

### Naive RAG

In [18]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [19]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [20]:
loader = PyPDFLoader(r"..\ballad\the_ballad_of_songbirds_and_snakes.pdf")
documents = loader.load()

In [39]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = CharacterTextSplitter()
# chunks = text_splitter.split_documents(documents)
# db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../ballad/vectordb/naive")
# db_naive.persist()
# retriever_naive = db_naive.as_retriever()

In [21]:
db_naive = Chroma(persist_directory = "../ballad/vectordb-edit/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [22]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [23]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.05it/s]


The average score is: 0.7476229255841002
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.944444          0.831993           0.597222        0.820833   

   Answer Similarity  Answer Correctness   Average  
0           0.878165             0.41308  0.747623  


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../ballad/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [24]:
db_basic = Chroma(persist_directory = "../ballad/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [25]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [26]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.67it/s]


The average score is: 0.7384210404925194


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421


### chunk sizes

In [27]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [28]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../ballad/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory = "../ballad/vectordb/recursive_basic", embedding_function=embeddings_client)
        # db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7539822774150747
This is the new best value!
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.532729          0.803469           0.907841   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.752864           0.731083            0.795908  0.753982  
Number of chunks for chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.93it/s]


The average score is: 0.746531580778468
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%          0.92          0.829388           0.597222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.808333           0.882359            0.441887  0.746532  
Number of chunks for chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7503026934325662
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%      0.531527          0.775393           0.907932   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.753544           0.731083            0.802336  0.750303  
Number of chunks for chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-500' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-501' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-502' coro=<AsyncClient.aclose() done, defined

The average score is: 0.7639724254546479
This is the new best value!
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.800333          0.951028           0.616525   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.890919            0.541697  0.763972  
Number of chunks for chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.75935755583295
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.816002          0.950323           0.616575   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.75           0.890744            0.532502  0.759358  
Number of chunks for chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.764096212592404
This is the new best value!
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%      0.790545          0.932472           0.654176   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.890744            0.533308  0.764096  
Number of chunks for chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7494286899321573
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%      0.757331          0.951969           0.582974   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.878419            0.542546  0.749429  
Number of chunks for chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.53it/s]


The average score is: 0.7144231231949257
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%      0.854167          0.746491           0.597222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.775           0.880496            0.433163  0.714423  
Number of chunks for chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7551886731824956
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%       0.76496          0.949421           0.654338   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.75           0.878244            0.534169  0.755189  
Number of chunks for chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7626237594141226
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%      0.786958          0.951187           0.619133   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.890919            0.544212  0.762624  
Number of chunks for chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-968' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-969' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-970' coro=<AsyncClient.aclose() done, defined

The average score is: 0.7517755819928807
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.753625          0.949662           0.616641   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.864847            0.542546  0.751776  
Number of chunks for chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:01<01:49,  1.85s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'sev

The average score is: 0.7442605008007958
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%      0.400117          0.928571           0.829433   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.633333           0.801053            0.873055  0.744261  
Number of chunks for chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.761945684889143
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%      0.769649            0.9406           0.655261   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.878579            0.544252  0.761946  
Number of chunks for chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7625637949047773
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.784939          0.955299           0.652623   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.75           0.889133            0.543388  0.762564  
Number of chunks for chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7426846399383767
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.762311          0.944377           0.582052   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.75           0.874847            0.542521  0.742685  
Number of chunks for chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7569364199232673
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.779172          0.949059           0.652841   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.75           0.878084            0.532462  0.756936  
Number of chunks for chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7520124121631414
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.767551          0.942964           0.617704   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.868084            0.532438  0.752012  
Number of chunks for chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1453' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1454' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1455' coro=<AsyncClient.aclose() done, defi

The average score is: 0.7549361695598794
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.765109          0.949812            0.61915   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.878084            0.534129  0.754936  
Number of chunks for chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7622618233196597
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%      0.531527          0.796436           0.912243   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.752622           0.764752            0.815992  0.762262  
Number of chunks for chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7425948215364686
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.509508          0.751365           0.939013   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.654342           0.742385            0.858955  0.742595  


chunk = 1000

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_1000")
# db_1000.persist()

In [11]:
# db_1000 = Chroma(persist_directory = "../ballad/vectordb/recursive_1000", embedding_function=embeddings_client)

In [51]:
# retriever_1000 = db_1000.as_retriever()
# result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.39it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9429, 'answer_relevancy': 0.8294, 'context_precision': 0.5889, 'context_recall': 0.8208, 'answer_similarity': 0.8868, 'answer_correctness': 0.4487}


In [52]:
# dict_result_1000 = dictionary(result_1000)

The average score is: 0.752912607504265


chunk = 500

In [33]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../ballad/vectordb/recursive_500")
# db_500.persist()

107


In [53]:
# db_500 = Chroma(persist_directory = "../ballad/vectordb/recursive_500", embedding_function=embeddings_client)

In [54]:
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-333' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packa

CHUNK SIZE 500
{'faithfulness': 0.6822, 'answer_relevancy': 0.7488, 'context_precision': 0.5967, 'context_recall': 0.7615, 'answer_similarity': 0.8450, 'answer_correctness': 0.6514}


In [55]:
# dict_result_500 = dictionary(result_500)

The average score is: 0.7142606903567085


chunk = 2000

In [35]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_2000")


37


In [56]:
# db_2000 = Chroma(persist_directory = "../ballad/vectordb/recursive_2000", embedding_function=embeddings_client)

In [57]:
# retriever_2000 = db_2000.as_retriever()
# result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.50it/s]


CHUNK SIZE 2000
{'faithfulness': 0.7667, 'answer_relevancy': 0.8230, 'context_precision': 0.5972, 'context_recall': 0.8208, 'answer_similarity': 0.8832, 'answer_correctness': 0.5073}


In [58]:
# dict_result_2000 = dictionary(result_2000)

The average score is: 0.7330396049761241


chunk = 3000

In [38]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_3000")

32


In [59]:
# db_3000 = Chroma(persist_directory = "../ballad/vectordb/recursive_3000", embedding_function=embeddings_client)

In [60]:
# retriever_3000 = db_3000.as_retriever()
# result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
# print("CHUNK SIZE 3000")
# print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.05it/s]


CHUNK SIZE 3000
{'faithfulness': 0.8595, 'answer_relevancy': 0.8386, 'context_precision': 0.6333, 'context_recall': 0.8208, 'answer_similarity': 0.8843, 'answer_correctness': 0.4686}


In [61]:
# dict_result_3000 = dictionary(result_3000)

The average score is: 0.7508713801108652


In [29]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


In [30]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.764096212592404


In [32]:
results_df.to_csv(f"../ballad/results/results_summarize.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [33]:
db_k = Chroma(persist_directory = "../ballad/vectordb-edit/chunking_1000_0", embedding_function=embeddings_client)

In [35]:
k_values = [2, 3, 5, 6, 7]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size 1000, overlap 0%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1954' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1955' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1956' coro=<AsyncClient.aclose() done, defi

The average score is: 0.781972560978867
This is the new best value!
Results for K=7:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=7      0.911223          0.825997   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.635716        0.884008           0.913225            0.521667   

    Average  
0  0.781973  


In [63]:
# retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
# result_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:06,  7.10it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

CHUNK SIZE 1000, K=3
{'faithfulness': 0.5797, 'answer_relevancy': 0.7975, 'context_precision': 0.5393, 'context_recall': 0.6875, 'answer_similarity': 0.7113, 'answer_correctness': 0.7793}


In [64]:
# dict_result_3 = dictionary(result_3)

The average score is: 0.6824532277442156


In [65]:
# retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
# result_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.14it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8000, 'answer_relevancy': 0.8256, 'context_precision': 0.5815, 'context_recall': 0.7800, 'answer_similarity': 0.8859, 'answer_correctness': 0.4240}


In [66]:
# dict_result_5 = dictionary(result_5)

The average score is: 0.7161788734282134


In [67]:
# retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})
# result_6 = run_and_evaluate(retriever_6, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.92it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8256, 'answer_relevancy': 0.8338, 'context_precision': 0.5782, 'context_recall': 0.8800, 'answer_similarity': 0.8907, 'answer_correctness': 0.5280}


In [68]:
# dict_result_6 = dictionary(result_6)

The average score is: 0.7560586606303001


In [69]:
# retriever_7= db_1000.as_retriever(search_kwargs={"k": 7})
# result_7 = run_and_evaluate(retriever_7, prompt, llm)
# print("CHUNK SIZE 1000, K=7")
# print(result_7)
# dict_result_7 = dictionary(result_7)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.49it/s]


CHUNK SIZE 1000, K=7
{'faithfulness': 0.9429, 'answer_relevancy': 0.8166, 'context_precision': 0.5914, 'context_recall': 0.7633, 'answer_similarity': 0.8919, 'answer_correctness': 0.5029}
The average score is: 0.751507469027492


In [36]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


### look for different retrievers

6 chunks was the best score

#### parent document retriever

In [37]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../ballad/vectordb-edit/parent-summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.13it/s]


The average score is: 0.7190440237375887
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.733333           0.75333   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.741667        0.726355           0.891473            0.468106   

    Average  
0  0.719044  


In [38]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../ballad/vectordb-edit/parent_small-summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-2077' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-2078' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The resp

The average score is: 0.7334064668789292
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.820649          0.835459   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.570873        0.727381           0.905459            0.540618   

    Average  
0  0.733406  


In [39]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../ballad/vectordb-edit/parent_large-summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': True, 'severity': 'medium'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.7352336975497553
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200      0.817811           0.83528   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0            0.57652        0.735714           0.905459            0.540618   

    Average  
0  0.735234  


In [40]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.781972560978867


In [41]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


#### Maximum marginal relevance retrieval

In [42]:
retriever_mmr = db_k.as_retriever(search_type="mmr",search_kwargs={"k": 7})
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  87%|████████▋ | 52/60 [00:13<00:04,  1.95it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'se

The average score is: 0.7462071896171293
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.478823          0.873469           0.826085        0.699968   

   Answer Similarity  Answer Correctness   Average  
0            0.75447            0.844428  0.746207  


In [43]:
replacement_map = {
    'Chunk size 3000, overlap 20%, K=2': 'Chunk size 1000, 0%, K=2',
    'Chunk size 3000, overlap 20%, K=3': 'Chunk size 1000, 0%, K=3',
    'Chunk size 3000, overlap 20%, K=5': 'Chunk size 1000, 0%, K=5',
    'Chunk size 3000, overlap 20%, K=6': 'Chunk size 1000, 0%, K=6',
    'Chunk size 3000, overlap 20%, K=7': 'Chunk size 1000, 0%, K=7'
}
results_df['System'] = results_df['System'].replace(replacement_map)

In [44]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


#### BM25

In [45]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)
chunks_1000 = text_splitter.split_documents(documents)

In [47]:
from langchain.retrievers import BM25Retriever
retriever_bm25 = BM25Retriever.from_documents(chunks_1000, search_kwargs={"k": 7})
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.5880465269428691
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25       0.47394          0.406398           0.661178        0.505556   

   Answer Similarity  Answer Correctness   Average  
0           0.722256            0.758951  0.588047  


In [49]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


In [50]:
results_df.drop(33,inplace = True)

In [51]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.781972560978867


#### Ensambler - Hybrid

In [52]:
retriever_7= db_k.as_retriever(search_kwargs={"k": 7})

In [53]:
from langchain.retrievers import EnsembleRetriever
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_7], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-2391' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-2392' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-2393' coro=<AsyncClient.aclose() done, defi

The average score is: 0.7161630214461998
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1      0.505687          0.842514           0.708968   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.61043           0.784137            0.845242  0.716163  


In [54]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_7], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.99it/s]


The average score is: 0.7449569324055525
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2        0.9125          0.821099           0.517745   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.785714            0.90548            0.527203  0.744957  


In [55]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_7], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.7648733901009135
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3      0.896586          0.909414           0.523145   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.97619            0.62381            0.660094  0.764873  


#### Multi-stage - reranker

In [20]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [22]:
retriever_context = retriever_mmr
compressor = CohereRerank(top_n=6)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  60%|██████    | 36/60 [00:06<00:02,  9.59it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.51it/s]


Reranker
{'faithfulness': 0.7556, 'answer_relevancy': 0.6267, 'context_precision': 0.6985, 'context_recall': 0.9000, 'answer_similarity': 0.9001, 'answer_correctness': 0.5243}
The average score is: 0.7341855512490597


#### creating context by remaking the query

In [56]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [57]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.781972560978867


In [59]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_7.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  25%|██▌       | 15/60 [00:04<00:11,  3.79it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

{'faithfulness': 0.7412, 'answer_relevancy': 0.7963, 'context_precision': 0.4538, 'context_recall': 0.9988, 'answer_similarity': 0.6126, 'answer_correctness': 0.7061}

In [60]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.7181310369478822


In [61]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.379673,,,0.841376,0.410613,0.66361
1,GPT-4,1.0,0.695899,,,0.900098,0.492424,0.742388
2,Naive,0.944444,0.831993,0.597222,0.820833,0.878165,0.41308,0.747623
3,Recursive,0.944444,0.837124,0.597222,0.771429,0.878165,0.402142,0.738421
4,"Chunk 500, overlap 0%",0.532729,0.803469,0.907841,0.752864,0.731083,0.795908,0.753982
5,"Chunk 500, overlap 5%",0.92,0.829388,0.597222,0.808333,0.882359,0.441887,0.746532
6,"Chunk 500, overlap 10%",0.531527,0.775393,0.907932,0.753544,0.731083,0.802336,0.750303
7,"Chunk 500, overlap 15%",0.800333,0.951028,0.616525,0.783333,0.890919,0.541697,0.763972
8,"Chunk 500, overlap 20%",0.816002,0.950323,0.616575,0.75,0.890744,0.532502,0.759358
9,"Chunk 1000, overlap 0%",0.790545,0.932472,0.654176,0.783333,0.890744,0.533308,0.764096


### change model to GPT-4

In [62]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
28,"Chunk size 1000, 0%, K=7",0.911223,0.825997,0.635716,0.884008,0.913225,0.521667,0.781973
27,"Chunk size 1000, 0%, K=6",0.93006,0.933761,0.594889,0.737576,0.897546,0.498981,0.765469
36,Ensambler 3,0.896586,0.909414,0.523145,0.97619,0.62381,0.660094,0.764873


In [65]:
results_df.to_csv(f"../ballad/results/results_summarize.csv")

In [66]:
result_k7_1000_0_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 0%, GPT-4", retriever_7, prompt, llm_gpt4, results_df)
result_k7_1000_0_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.65it/s]


The average score is: 0.7738937121554766


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",0.725556,0.913338,0.608115,0.913333,0.890853,0.592167,0.773894


In [67]:
retriever6 =  db_k.as_retriever(search_kwargs={"k": 6})
result_k6_1000_0_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 0%, GPT-4", retriever6, prompt, llm_gpt4, results_df)
result_k6_1000_0_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  35%|███▌      | 21/60 [00:05<00:07,  5.11it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

The average score is: 0.7504103152560165


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",0.548149,0.806642,0.799038,0.714333,0.784592,0.849707,0.75041


In [68]:
result_ensemble3_gpt4, results_df = run_and_evaluate(f"Ensambler 3, GPT-4", ensemble_retriever_3, prompt, llm_gpt4, results_df)
result_ensemble3_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.7955151771937771
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Ensambler 3, GPT-4",0.904894,0.602332,0.892218,0.888205,0.677399,0.808044,0.795515


In [69]:
results_df.to_csv(f"../ballad/results/results_summarize.csv")