In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
import sys
sys.tracebacklimit = 0
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import PyPDFLoader
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [34]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [5]:
questions = [
    "Analyze the following quote from Dr. Volumnia Gaul from 'Ballad of Songbirds and Snakes': 'There is a point to everything or nothing at all, depending on your worldview.'",
    "Analyze the following quote from 'Ballad of Songbirds and Snakes': 'He'd loved the unfamiliar sense of safety that their defeat had brought. The security that could only come with power. The ability to control things. Yes, that was what he'd loved best of all.'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'Snow lands on top.'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': '...at eighteen, the heir to the once-great house of Snow had nothing to live on but his wits.'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'Bombs and blood. That was how the rebels had killed his mother. He wondered if they had killed Lucy Gray's, too. 'Just like her pearly white bones.' She seemed to have no love for District 12, always separating herself from it, saying she was, what was it . . . Covey?'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'All right, so he'd dropped the handkerchief with Lucy Gray's scent - the one from the outside pocket of his book bag - into the snake tank. He'd done it so they would not bite her as they had Clemensia. So they would not kill her. Because he cared about her. Because he cared about her? Or because he wanted to win the Hunger Games so that he could secure the Plinth Prize. If it was the latter, he had cheated to win, and that was that.'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'The Hunger Games are an unnatural, vicious punishment. How could a good person like you be expected to go along with them?'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'If I'm helping to kill people in the districts, how is it any better than helping to kill them in the Hunger Games?'",
    "Analyze this quote from 'Ballad of Songbirds and Snakes': 'You don't think I've invested all this time in you to hand you off to those imbeciles in the districts, do you?'"
]


ground_truths = [
    ["Throughout The Ballad, Dr. Gaul presents Coryo and the other students with items of political philosophy, often in order to explain her own views of power and control. Here, her contrast between a 'point' and 'nothing' plays into a few of her larger ideas. Authoritarian though she is, Dr. Gaul sees the power of the Capitol as serving a useful point in rescuing Panem from chaos. On another level, the situation that provokes this comment - Coryo and Clemensia observing Dr. Gaul's multicolored snakes and wondering if there is a 'point' to the coloration - reveals that the idea of purpose extends well beyond vast political constructs in the novel. Dr. Gaul's rainbow snakes serve a direct point in making the Hunger Games a stark spectacle, and they serve a narrative point in connecting Lucy with one of her symbolic animals in the arena. It is clear what worldview Dr. Gaul would endorse."],
    ["Here, Coryo is completing an assignment for Dr. Gaul on the rather twisted topic of what he liked about the war, and he is proving just how compatible Dr. Gaul's authoritarian worldview is with his own. Of course, Coryo's reasons for disliking the Districts are not purely sociopathic. The warfare caused his family real suffering and gave him the impression that, despite his apparent intelligence and his potential for decency, he was, for a time, fundamentally helpless. Still, is it reassuring 'security' or maniacal 'power' that really appeals to Coriolanus? On the basis of the later events of his life, his desire for control and power becomes twisted beyond understandable self-preservation and into open villainy."],
    ["This is something of a Snow family motto for Tigris and Coryo: the 'saying that had gotten them through the war' (Ballad, 9), as the two cousins recollect early in the novel. Though loaded with reassurance and a pleasing sense of inevitability, this mantra varies in tone, context, and implication as The Ballad of Songbirds and Snakes progresses. 'Snow lands on top' appears early on, when Tigris is contemplating her family's poverty and Snow is eager to excel as a mentor. These are inspiring words for difficult circumstances, but by the final time the motto appears—in the Epilogue, with Coryo victorious and increasingly vicious—a Snow has in fact landed on top and has succeeded in crushing plenty of other people."],
    ["Here, we are told that Snow is only eighteen, and we are led to contemplate the differences between the wealthy dictator of the later Hunger Games trilogy and the earnest young Coryo we see here. His kindly mother and his domineering father are dead, and he doesn't have their fortune—or any fortune—to benefit him, since his family's resources were lost in the rebellion with the presumed destruction of District 13. Now, Snow must use his wits and cunning to escape his situation. Thus, the book sets up Coryo's need to restore his family's name as one of the character's central motivations. Readers of the final trilogy may know the end result of Snow's life, but the task of connecting this impoverished young man to the well-heeled, sociopathic President Snow offers plenty of intrigue."],
    ["Is the bond between Snow and Lucy mainly based on self-interest or on genuine affection? While a romance between an aristocrat from the Capitol and a wandering musician may seem to have a shaky foundation, Snow's early reflections indicate that he feels some sympathy for Lucy. Their shared experience of the war is one of loneliness and displacement. Still, as Snow's need to excel takes precedence, his connection to Lucy becomes more complicated. The idea that Lucy has 'no love for District 12,' for instance, will later be deployed by Snow in discussions and interviews meant to appeal to Capitol sponsors. Of course, construing Lucy as non-District may help her popularity and her odds of winning, but it is impossible to separate Snow's bond with Lucy from his own determination to improve his life."],
    ["The narration of The Ballad is by no means indirect about the moral dilemmas that Coryo encounters. Here, the narrative explicitly raises the question of whether he desires Lucy's safety or a personal win more, underscoring this theme with sentences that are identical but for a single unit of punctuation: 'Because he cared about her. Because he cared about her?' At this point in the novel, it is possible to read a single action as doubled-sided and Coryo's confusion as genuine. By the end, however, he will decisively choose his future over Lucy, reaching a level of self-interest that he doesn't definitively attain in this scene."],
    ["There are a few Capitol citizens who object to the Hunger Games in a principled manner, though not all of them speak out as sharply as Sejanus Plinth does. Among the mentors, Lysistrata Vickers seems to harbor real compassion for her tribute, and Coriolanus himself is troubled - at least for a time - by the fact that he kills a tribute while escaping the arena. Tigris's own objections are, from one perspective, surprising. Though related to a Capitol loyalist (her grandmother) and a future authoritarian (her cousin), she finds the games miserable and believes that Coryo does the same. Her judgment of Coriolanus may be flawed and her idea may not quite fit her family's sympathies."],
    ["In part, Sejanus's decision to become a Peacekeeper was meant to remove him from a Capitol milieu that he never found appealing. Nonetheless, this quotation reveals that Sejanus is in an impossible position. If becoming a Peacekeeper - ideally, a Peacekeeper doctor or medic - was meant as a form of escape and a route to atonement, then Sejanus's new approach has led him down the wrong path. He is forced to help an institution that executes rebels without trial and that, for the most part, is willing to let the Districts remain in disrepair - beyond providing the Capitol with resources, that is."],
    ["Here, Dr. Gaul reveals that, in her own supremely self-interested way, she holds Coriolanus in high regard. He is a fantastic investment, and his time in District 12 has proven that he can thrive even in conditions of deprivation and uncertainty despite his Capitol pedigree. Dr. Gaul's own Capitol elitism is on display in her reference to the 'imbeciles' from the Districts. How much, however, did Coryo really learn from his time in the Districts? His stint as a Peacemaker is construed as a necessary toughening in The Ballad, but the events of the later Hunger Games trilogy suggest that Coriolanus comes to underestimate those District 'imbeciles' at his own peril."]
   ]

In [25]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [7]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [8]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [35]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [19]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_llm(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answer by LLMs

In [10]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [11]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [12]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [13]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [26]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  56%|█████▌    | 20/36 [00:03<00:01,  9.56it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 36/36 [00:06<00:00,  5.67it/s]


The average score is: 0.810784095434097
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5      0.971429          0.745413                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.866096            0.660199  0.810784  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [68]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 36/36 [00:06<00:00,  5.51it/s]


The average score is: 0.8307891316627263
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4           1.0          0.783206                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.861252            0.678698  0.830789  


# RAG

### Naive RAG

In [28]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [29]:
loader = PyPDFLoader(r"..\ballad\the_ballad_of_songbirds_and_snakes.pdf")
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../ballad/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [30]:
db_naive = Chroma(persist_directory = "../ballad/vectordb-edit/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [31]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [32]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)



In [36]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7021937943559574
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.881621          0.726491             0.5125        0.693275   

   Answer Similarity  Answer Correctness   Average  
0           0.683565             0.71571  0.702194  


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../ballad/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [37]:
db_basic = Chroma(persist_directory = "../ballad/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [38]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)



In [39]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-164' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-165' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The respon

The average score is: 0.7157690662921116


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.865996,0.729346,0.5125,0.580707,0.816281,0.789784,0.715769


### chunk sizes

In [40]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [41]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../ballad/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory=f"../ballad/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}", embedding_function=embeddings_client)
        # db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.56it/s]


The average score is: 0.6824418951653323
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.866667           0.88476           0.376543   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.435185           0.866883            0.664614  0.682442  
Number of chunks for chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.69it/s]


The average score is: 0.7011332254990136
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.680952          0.901854           0.441358   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.666667           0.863189             0.65278  0.701133  
Number of chunks for chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  6.79it/s]


The average score is: 0.7091465704405379
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%           0.8          0.885184           0.450617   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.592593           0.863726             0.66276  0.709147  
Number of chunks for chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:06<00:00,  8.08it/s]


The average score is: 0.7791517963321956
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%       0.82619          0.879043           0.645062   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.824074           0.869306            0.631235  0.779152  
Number of chunks for chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.69it/s]


The average score is: 0.709873706222197
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.714286          0.903706           0.506173   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.675926           0.872395            0.586756  0.709874  
Number of chunks for chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7142748275123901
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%      0.834282          0.723396             0.4875   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.689591           0.780951            0.769929  0.714275  
Number of chunks for chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6929915529118457
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%      0.859269          0.710329           0.520833   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.567662            0.78902            0.710835  0.692992  
Number of chunks for chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6843498324662227
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%      0.833369          0.724186           0.472222   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.635688           0.677939            0.762695  0.68435  
Number of chunks for chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6877679727146745
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%      0.809964          0.719308           0.519048   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.558069           0.735682            0.784538  0.687768  
Number of chunks for chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6682921671909154
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%      0.808816          0.718937           0.380952   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.538053           0.763482            0.799512  0.668292  
Number of chunks for chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6527207248465835
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.788936          0.747135           0.470833   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.693106            0.66052            0.555794  0.652721  
Number of chunks for chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6827169705822705
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%        0.8193          0.724491           0.537037   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.653868           0.765334            0.596272  0.682717  
Number of chunks for chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6891334308860277
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%      0.894395          0.730272           0.470833   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.560752           0.678642            0.799907  0.689133  
Number of chunks for chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6680273033133194
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.762022          0.723838           0.442857   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.579288           0.733706            0.766451  0.668027  
Number of chunks for chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6936143090952968
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.859328            0.7308           0.470833   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.675626           0.699462            0.725637  0.693614  
Number of chunks for chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1058' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1059' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1060' coro=<AsyncClient.aclose() done, defi

The average score is: 0.7434476339150485
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.864944          0.729709             0.6375   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.689309           0.733664            0.805561  0.743448  
Number of chunks for chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.68596410701437
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%       0.86207          0.747021           0.464286   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.729848           0.670983            0.641577  0.685964  
Number of chunks for chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6816862158841568
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.863684          0.711061                0.5   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.579188           0.677831            0.758354  0.681686  
Number of chunks for chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6558806901737344
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%      0.884277          0.744819           0.414583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.577723           0.736018            0.577863  0.655881  
Number of chunks for chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.7006815116686758
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.877549          0.747584           0.458333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.747531            0.72432            0.648772  0.700682  


chunk = 1000

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_1000")
# db_1000.persist()

In [28]:
# db_1000 = Chroma(persist_directory = "../ballad/vectordb/recursive_1000", embedding_function=embeddings_client)

In [29]:
# retriever_1000 = db_1000.as_retriever()
# result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000)
# avg_result_1000, dict_result_1000 = dictionary(result_1000)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

CHUNK SIZE 1000
{'faithfulness': 0.8073, 'answer_relevancy': 0.7830, 'context_precision': 0.4722, 'context_recall': 0.7878, 'answer_similarity': 0.6856, 'answer_correctness': 0.8029}
The average score is: 0.7231345250780993


chunk = 500

In [33]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../ballad/vectordb/recursive_500")
# db_500.persist()

107


In [30]:
# db_500 = Chroma(persist_directory = "../ballad/vectordb/recursive_500", embedding_function=embeddings_client)

In [31]:
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)
# avg_result_500, dict_result_500 = dictionary(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-202' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-203' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-204' coro=<AsyncClient.aclose() done, defined

CHUNK SIZE 500
{'faithfulness': 0.8163, 'answer_relevancy': 0.9034, 'context_precision': 0.4012, 'context_recall': 0.6481, 'answer_similarity': 0.8669, 'answer_correctness': 0.6844}
The average score is: 0.720067425291858


chunk = 2000

In [35]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_2000")


37


In [32]:
# db_2000 = Chroma(persist_directory = "../ballad/vectordb/recursive_2000", embedding_function=embeddings_client)

In [33]:
# retriever_2000 = db_2000.as_retriever()
# result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)
# avg_result_2000, dict_result_2000 = dictionary(result_2000)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

CHUNK SIZE 2000
{'faithfulness': 0.8064, 'answer_relevancy': 0.7431, 'context_precision': 0.4389, 'context_recall': 0.6734, 'answer_similarity': 0.6855, 'answer_correctness': 0.7563}
The average score is: 0.6839296969751175


chunk = 3000

In [38]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_3000")

32


In [34]:
# db_3000 = Chroma(persist_directory = "../ballad/vectordb/recursive_3000", embedding_function=embeddings_client)

In [35]:
# retriever_3000 = db_3000.as_retriever()
# result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
# print("CHUNK SIZE 3000")
# print(result_3000)
# avg_result_3000 ,dict_result_3000 = dictionary(result_3000)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

CHUNK SIZE 3000
{'faithfulness': 0.8087, 'answer_relevancy': 0.7442, 'context_precision': 0.5190, 'context_recall': 0.6554, 'answer_similarity': 0.7109, 'answer_correctness': 0.8236}
The average score is: 0.7103154417058323


In [42]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.971429,0.745413,,,0.866096,0.660199,0.810784
1,GPT-4,0.942857,0.884441,,,0.860459,0.654768,0.835631
2,Naive,0.881621,0.726491,0.5125,0.693275,0.683565,0.71571,0.702194
3,Recursive,0.865996,0.729346,0.5125,0.580707,0.816281,0.789784,0.715769
4,"Chunk 500, overlap 0%",0.866667,0.88476,0.376543,0.435185,0.866883,0.664614,0.682442
5,"Chunk 500, overlap 5%",0.680952,0.901854,0.441358,0.666667,0.863189,0.65278,0.701133
6,"Chunk 500, overlap 10%",0.8,0.885184,0.450617,0.592593,0.863726,0.66276,0.709147
7,"Chunk 500, overlap 15%",0.82619,0.879043,0.645062,0.824074,0.869306,0.631235,0.779152
8,"Chunk 500, overlap 20%",0.714286,0.903706,0.506173,0.675926,0.872395,0.586756,0.709874
9,"Chunk 1000, overlap 0%",0.834282,0.723396,0.4875,0.689591,0.780951,0.769929,0.714275


In [29]:
results_df.to_csv(f"../ballad/results/results_report.csv", index=False)

In [43]:
highest_average = results_df.nlargest(3, "Average")
highest_average

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
1,GPT-4,0.942857,0.884441,,,0.860459,0.654768,0.835631
0,GPT-3.5,0.971429,0.745413,,,0.866096,0.660199,0.810784
7,"Chunk 500, overlap 15%",0.82619,0.879043,0.645062,0.824074,0.869306,0.631235,0.779152


### now time to look for different top-k

Note: We continue with Chunk 3000, overlap 10%

In [44]:
db_k = Chroma(persist_directory = "../ballad/vectordb-edit/chunking_500_15", embedding_function=embeddings_client)

In [45]:
k_values = [2, 3, 5, 6, 7]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size 500, overlap 15%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:06<00:00,  7.77it/s]


The average score is: 0.7578063194974834
Results for K=2:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=2          0.74          0.880395   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.666667         0.72963           0.870647              0.6595   

    Average  
0  0.757806  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'low'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.7246070182988021
Results for K=3:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=3      0.750577          0.806049   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.775391        0.453704           0.804792             0.75713   

    Average  
0  0.724607  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  6.95it/s]


The average score is: 0.8265952620225997
Results for K=5:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=5      0.966667          0.905295   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.636883        0.944444           0.874627            0.631657   

    Average  
0  0.826595  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:06<00:00,  7.76it/s]


The average score is: 0.7564957837489547
Results for K=6:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=6      0.792857          0.897322   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.681821        0.675926           0.866473            0.624575   

    Average  
0  0.756496  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  19%|█▊        | 10/54 [00:03<00:13,  3.25it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

The average score is: 0.7512490419021468
Results for K=7:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=7      0.693972          0.747306   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.844508        0.625705           0.824804              0.7712   

    Average  
0  0.751249  


In [36]:
# retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
# result_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)
# avg_result_3, dict_result_3 = dictionary(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:09<00:00,  5.72it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.8200, 'answer_relevancy': 0.9060, 'context_precision': 0.5370, 'context_recall': 0.7963, 'answer_similarity': 0.8722, 'answer_correctness': 0.6900}
The average score is: 0.7702549797617143


In [37]:
# retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
# result_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)
# avg_result_5, dict_result_5 = dictionary(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:10<00:00,  5.31it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8667, 'answer_relevancy': 0.8925, 'context_precision': 0.4636, 'context_recall': 0.6204, 'answer_similarity': 0.8748, 'answer_correctness': 0.6645}
The average score is: 0.7303917310800102


In [38]:
# retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})
# result_6 = run_and_evaluate(retriever_6, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_6)
# avg_result_6, dict_result_6 = dictionary(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:10<00:00,  5.29it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8667, 'answer_relevancy': 0.9038, 'context_precision': 0.4434, 'context_recall': 0.6481, 'answer_similarity': 0.8671, 'answer_correctness': 0.6306}
The average score is: 0.7266041952543404


In [39]:
# retriever_7= db_1000.as_retriever(search_kwargs={"k": 7})
# result_7 = run_and_evaluate(retriever_7, prompt, llm)
# print("CHUNK SIZE 1000, K=7")
# print(result_7)
# avg_result_7, dict_result_7 = dictionary(result_7)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

CHUNK SIZE 1000, K=7
{'faithfulness': 0.9253, 'answer_relevancy': 0.7778, 'context_precision': 0.4403, 'context_recall': 0.8073, 'answer_similarity': 0.7465, 'answer_correctness': 0.5476}
The average score is: 0.7074589251052892


In [41]:
# retriever_2 = db_1000.as_retriever(search_kwargs={"k": 2})
# result_2 = run_and_evaluate(retriever_2, prompt, llm)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)
# avg_result_2, dict_result_2 = dictionary(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:11<00:00,  4.87it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 0.6167, 'answer_relevancy': 0.8891, 'context_precision': 0.5000, 'context_recall': 0.7778, 'answer_similarity': 0.8708, 'answer_correctness': 0.6751}
The average score is: 0.7215756009304392


### look for different retrievers


#### parent document retriever

In [47]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../ballad/vectordb-edit/parent-report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_kwargs = ({"k": 5})
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:06<00:00,  8.95it/s]


The average score is: 0.6465194256933376
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200       0.73125          0.582884   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.444444        0.685185           0.862021            0.573332   

    Average  
0  0.646519  


In [48]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../ballad/vectordb-edit/parent_small-report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
    search_kwargs = ({"k": 5})
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': True, 'severity': 'medium'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.7753608442569068
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.815535          0.880237   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.719369        0.648148           0.850717            0.738159   

    Average  
0  0.775361  


In [49]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../ballad/vectordb-edit/parent_large-report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
    search_kwargs = ({"k": 5})
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_large, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.66it/s]


The average score is: 0.7104080808180977
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200         0.775           0.79208   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.444444        0.768519           0.869522            0.612883   

    Average  
0  0.710408  


#### Maximum marginal relevance retrieval

In [51]:
retriever_mmr = db_k.as_retriever(search_type="mmr",search_kwargs = ({"k": 5}))
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.57it/s]


The average score is: 0.8071934659779852
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.792857          0.898714           0.738735        0.888889   

   Answer Similarity  Answer Correctness   Average  
0           0.874188            0.649778  0.807193  


#### BM25

In [53]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 75)
chunks_500 = text_splitter.split_documents(documents)

In [54]:
retriever_bm25 = BM25Retriever.from_documents(chunks_500,search_kwargs = ({"k": 5}))
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/54 [00:01<01:15,  1.43s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severi

The average score is: 0.7419127592786244
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25      0.851285          0.577793           0.761905        0.880525   

   Answer Similarity  Answer Correctness   Average  
0           0.600237            0.779732  0.741913  


#### Ensambler - Hybrid

In [55]:
retriever_500 = db_k.as_retriever(search_kwargs = ({"k": 5}))

In [56]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.24it/s]


The average score is: 0.778723471794867
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1        0.8625          0.896181           0.641499   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.712963           0.883615            0.675582  0.778723  


In [57]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.16it/s]


The average score is: 0.8248493896414025
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2      0.911111          0.899975           0.631411   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.944444           0.883797            0.678357  0.824849  


In [58]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_500], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/54 [00:01<01:19,  1.50s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severi

The average score is: 0.769417606058456
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3       0.82858          0.589649              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.784893            0.55125            0.887134  0.769418  


#### Multi-stage - reranker

In [54]:
# os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [55]:
# retriever_context = retriever_3
# compressor = CohereRerank(top_n = 5)
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=compressor, base_retriever=retriever_context
# )

# result_compression = run_and_evaluate(compression_retriever, prompt, llm)
# print("Reranker")
# print(result_compression)
# avg_result_compression, dict_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:08<00:00,  6.67it/s]


Reranker
{'faithfulness': 0.8286, 'answer_relevancy': 0.8882, 'context_precision': 0.5185, 'context_recall': 0.7407, 'answer_similarity': 0.8663, 'answer_correctness': 0.6374}
The average score is: 0.7466056282196497


In [56]:
# retriever_context_more = retriever_500
# compressor = CohereRerank(top_n = 5)
# compression_retriever_more = ContextualCompressionRetriever(
#     base_compressor=compressor, base_retriever=retriever_context_more
# )

# result_compression_more = run_and_evaluate(compression_retriever_more, prompt, llm)
# print("Reranker")
# print(result_compression_more)
# avg_result_compression_more, dict_result_compression_more = dictionary(result_compression_more)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

Reranker
{'faithfulness': 0.6468, 'answer_relevancy': 0.8313, 'context_precision': 0.6104, 'context_recall': 0.5648, 'answer_similarity': 0.8625, 'answer_correctness': 0.7789}
The average score is: 0.7157937819391315


#### creating context by remaking the query

In [60]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [61]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_500.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/54 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': True, 'severity': 'medium'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 

{'faithfulness': 0.8266, 'answer_relevancy': 0.6130, 'context_precision': 0.5875, 'context_recall': 0.5302, 'answer_similarity': 0.7671, 'answer_correctness': 0.5118}

In [62]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

The average score is: 0.6393715733935755


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.971429,0.745413,,,0.866096,0.660199,0.810784
1,GPT-4,0.942857,0.884441,,,0.860459,0.654768,0.835631
2,Naive,0.881621,0.726491,0.5125,0.693275,0.683565,0.71571,0.702194
3,Recursive,0.865996,0.729346,0.5125,0.580707,0.816281,0.789784,0.715769
4,"Chunk 500, overlap 0%",0.866667,0.88476,0.376543,0.435185,0.866883,0.664614,0.682442
5,"Chunk 500, overlap 5%",0.680952,0.901854,0.441358,0.666667,0.863189,0.65278,0.701133
6,"Chunk 500, overlap 10%",0.8,0.885184,0.450617,0.592593,0.863726,0.66276,0.709147
7,"Chunk 500, overlap 15%",0.82619,0.879043,0.645062,0.824074,0.869306,0.631235,0.779152
8,"Chunk 500, overlap 20%",0.714286,0.903706,0.506173,0.675926,0.872395,0.586756,0.709874
9,"Chunk 1000, overlap 0%",0.834282,0.723396,0.4875,0.689591,0.780951,0.769929,0.714275


### change model to GPT-4

In [64]:
results_retriever_500_gpt4, results_df = run_and_evaluate(f"Chunk size 500, overlap 15%, K=5- GPT4", retriever_500, prompt, llm_gpt4, results_df)
print(results_retriever_500_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:07<00:00,  7.50it/s]


The average score is: 0.8268215027275688
                                   System  Faithfulness  Answer Relevancy  \
0  Chunk size 500, overlap 15%, K=5- GPT4           1.0          0.745908   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.658642        0.944444           0.886735            0.725199   

    Average  
0  0.826822  


In [65]:
result_ensemble2_gpt4, results_df = run_and_evaluate(f"Ensambler 2, GPT-4", ensemble_retriever_2, prompt, llm_gpt4, results_df)
result_ensemble2_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  17%|█▋        | 9/54 [00:01<00:06,  6.45it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 54/54 [00:08<00:00,  6.42it/s]


The average score is: 0.8358661838530649
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Ensambler 2, GPT-4",0.947619,0.879338,0.689436,0.922222,0.892164,0.684419,0.835866


In [66]:
result_mmr_gpt4, results_df = run_and_evaluate(f"MMR GPT4", retriever_mmr, prompt, llm_gpt4, results_df)
print(result_mmr_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:32<00:00,  1.68it/s]


The average score is: 0.8482167245348319
This is the new best value!
     System  Faithfulness  Answer Relevancy  Context Precision  \
0  MMR GPT4           1.0          0.869917           0.738735   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.888889           0.871896            0.719863  0.848217  


In [70]:
results_df = results_df[results_df['System'] != 'GPT-4']
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.971429,0.745413,,,0.866096,0.660199,0.810784
2,Naive,0.881621,0.726491,0.5125,0.693275,0.683565,0.71571,0.702194
3,Recursive,0.865996,0.729346,0.5125,0.580707,0.816281,0.789784,0.715769
4,"Chunk 500, overlap 0%",0.866667,0.88476,0.376543,0.435185,0.866883,0.664614,0.682442
5,"Chunk 500, overlap 5%",0.680952,0.901854,0.441358,0.666667,0.863189,0.65278,0.701133
6,"Chunk 500, overlap 10%",0.8,0.885184,0.450617,0.592593,0.863726,0.66276,0.709147
7,"Chunk 500, overlap 15%",0.82619,0.879043,0.645062,0.824074,0.869306,0.631235,0.779152
8,"Chunk 500, overlap 20%",0.714286,0.903706,0.506173,0.675926,0.872395,0.586756,0.709874
9,"Chunk 1000, overlap 0%",0.834282,0.723396,0.4875,0.689591,0.780951,0.769929,0.714275
10,"Chunk 1000, overlap 5%",0.859269,0.710329,0.520833,0.567662,0.78902,0.710835,0.692992


In [67]:
results_df.to_csv(f"../ballad/results/results_report.csv", index=False)