In [1]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import sys
sys.tracebacklimit = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
questions = [
    "Character analysis of Sejanus Plinth",
    "Character analysis of Lucy Gray",
    "Character analysis of Tigris Snow",
    "What are the places 1-10 in the 10th Hunger Games?",
    "How does Lucy Gray win the 10th Hunger games?",
    "How and Who Came Up with the Hunger Games?",
    "Why did Snow join the mentorship program in The Ballad of Songbirds and Snake?",
    "What idea does Snow present to the Head Game Maker Volumnia Gaul?",
    "What evidence does Snow secretly record and send to Dr. Gaul, and what is the result of that?",
    "How do Snow and Lucy gain popularity in the Capitol?"
]


ground_truths = [
    ["Sejanus is one of Coriolanus’s classmates and a mentor in the Hunger Games. Coriolanus and many of his classmates treat Sejanus coldly, as Sejanus isn’t Capitol-born. Rather, his father, a munitions magnate from District Two, bought his way into Capitol high society. Sejanus resents his father and hates the Capitol; like his mother, Ma, he still feels connected to District Two and considers the district home. Coriolanus gets drawn into Sejanus’s orbit when, during the reaping, Sejanus shares that his tribute from District Two, Marcus, is a former classmate. From this moment on, Coriolanus begins saving Sejanus from danger or embarrassment on many occasions. He takes on the role of Sejanus’s mentor, encouraging Sejanus to remain loyal to the Capitol and not step too far out of line in protesting the Games. However, Sejanus remains convinced that the Games are wrong, and he goes so far as to enter the arena on the first night. He expects the tributes to kill him and believes this will make a statement to the Capitol that the Games are inhumane. Sejanus ultimately agrees to leave the arena with Coriolanus and is unsuccessful in sending the message he intended. His actions, however, get him in trouble with the Capitol. Though Sejanus resents his father’s money, Strabo buys Sejanus’s way out of trouble and allows him to join the Peacekeepers. Sejanus goes into the Peacekeepers with characteristic optimism—he wants to train as a medic and help people. But when Sejanus learns he can’t be a medic if there’s no war, he instead falls in with rebels in District 12, helps plot an escape, and purchases guns for the rebel forces. He outright ignores Coriolanus’s attempts to keep him out of trouble—but Coriolanus does end up double-crossing Sejanus. He records Sejanus’s admission that he’s helping the rebels with a jabberjay, and Dr. Gaul eventually hears the message. Sejanus is ultimately executed for treason."],
    ["Lucy Gray is the female tribute from District 12. Lucy Gray proves herself to be a cutthroat performer when, at the reaping, she slips a snake down a girl’s dress and then sings a song onstage. She loves color—she usually wears a dress of rainbow ruffles—and often uses bird imagery when she speaks. As Coriolanus gets to know her, he learns that she’s an orphan. She’s a member of the Covey, a traveling band of musicians and performers that’s now permanently based in District 12. With this information, Coriolanus presents Lucy Gray as a tribute not from District 12, but as a person who’s more like people in the Capitol. This campaign is successful; Lucy Gray becomes the fan favorite. She and Coriolanus fall in love in the week before the Games, sharing a passionate kiss—and Coriolanus gives her his mother’s compact, both as a token of his affection and so she can sneak rat poison into the arena. Throughout the Games, Lucy Gray mostly stays hidden. She kills several people with rat poison and one person with one of the neon snakes Dr. Gaul drops in the arena. After she wins the Games, the Capitol sends her home, where she rejoins the Covey. She and Coriolanus attempt to keep their romance alive while Coriolanus is stationed there, but this becomes complicated. Many of Lucy Gray’s songs are about a former lover, Billy Taupe, whom she now hates but who is still in her life. However, when Lucy Gray starts to fear that the mayor (who incorrectly believes Lucy Gray killed his daughter) is going to hurt her, Lucy Gray and Coriolanus decide to run away. Lucy Gray prizes trust and friendship, so when she infers that Coriolanus is responsible for Sejanus’s death, she tries to run away. She sets a trap for Coriolanus that results in him being bitten by a snake. After this, Coriolanus shoots at her. He isn’t sure if he hits her, but he ultimately decides he doesn’t care. Though Lucy Gray mysteriously disappears after this, several songs she wrote persist—they appear 64 years later in the Hunger Games trilogy.Lucy Gray Baird Quotes in The Ballad of Songbir"],
    ["Tigris is Coriolanus’s cousin. She’s a few years older than Coriolanus and has been living with him and the Grandma’am since the war, when she was also orphaned. Tigris is kind, caring, and intelligent. Tigris has always taken it upon herself to look out for Coriolanus. This meant that she learned to cook as a young child during the war—but she also implies that at several points, she’s turned to sex work to make ends meet. Her dream has always been to work in fashion, which she’s doing in the novel’s present. Though she’s supposedly working as an apprentice, Coriolanus suggests that her employer treats her more as a grunt, making her do unsavory or dirty tasks rather than teaching her how to make clothes. Despite this, Tigris is resourceful and excels at upcycling old garments—she manages to transform one of Crassus Snow’s stained old shirts into a gorgeous, classy garment for Coriolanus to wear to the reaping. As the Hunger Games approach and begin, Tigris becomes increasingly skeptical of the Games and of Coriolanus’s involvement in them. Particularly as she develops sympathy for Lucy Gray—and later, after she realizes Dr. Gaul forced Coriolanus into the arena, putting him in danger—she expresses that the Games are wrong and not fair to any of the children involved, mentors or tributes."],
    ["1. Lucy Gray (victor), 2. Reaper, 3. Treech, 4. Teslee, 5. Mizzen, 6. Coral, 7. Circ, 8. Wovey, 9. Tanner, 10. Lamina"],
    ["Lucy Gray plays the long game once that year’s tournament begins. She spends the first day in hiding after running into the woods instead of picking from the weapons available, which ultimately leads to what is commonly known as the “blood bath.” She then uses water bottles, poison, and snakes to kill the other tributes, though she retains her humanity and comforts one of the tributes she kills by holding him and speaking to him softly. Lucy was, however, helped by her mentor from outside the arena—he slipped a handkerchief with Lucy’s scent into a tank of snakes that were later released into the Games. This meant that the snakes didn’t hunt Lucy, and she survived their arrival in the arena. In the end, after waiting it out and cleverly avoiding death, Lucy Gray is declared the winner of the 10th annual Hunger Games—a harrowing yet tremendous victory not only for her but Snow and her home, District 12, too."],
    ["One of the many tragic backstories in Ballad, Dean Casca Highbottom is the remorseful creator of the Hunger Games. He came up with the idea for a school project, but he was drunk and never meant for it to be put into practice. Unfortunately, his friend and partner for the project, Crassus Snow, submitted the idea to their teacher, Dr. Gaul, and, to Highbottom's horror, the Hunger Games were born. After his creation comes to life, Highbottom becomes a morphling addict."],
    ["Coriolanus Snow joined the mentorship program in 'The Ballad of Songbirds and Snakes' primarily motivated by his desire to restore his family's wealth and social standing. After losing both parents in the war, Snow's family becomes destitute, relying on food rations. Living in the Capitol with his grandmother and cousin Tigress, Snow sees the mentorship program as an opportunity to potentially win the cash prize and improve his family's dire situation. Despite initial reservations about mentoring the female tribute from District 12, the poorest district in Panem, Snow's motivation evolves as he develops a connection with Lucy Gray Baird. Over time, Snow's participation is influenced by his observations of the power dynamics and suffering within the Games, leading him to propose a betting and sponsorship program that shapes the future of the Hunger Games. The story unfolds as Snow's motivations become more complex, navigating personal ambition, family restoration, and a gradual realization of societal injustices within Panem."],
    ["Coriolanus Snow presents the idea of a betting and sponsorship program to the Head Game Maker Volumnia Gaul. Tasked with writing a paper about additions to the Hunger Games, Snow suggests this program to engage Capitol citizens in supporting their favorite tributes. He sees it as an opportunity to restore his family's wealth and secure his future. The concept adds a strategic layer to the Games, allowing tributes to attract sponsors by building alliances and performing well. Despite initial suspicions from Gaul, the idea proves successful, leading to increased sponsorships and support for Snow's tribute, Lucy Gray Baird. This idea becomes instrumental in the modern Hunger Games, marking a significant development in Snow's character as he navigates manipulation, moral dilemmas, and the power dynamics within Panem's society."],
    ["Coriolanus Snow secretly records and sends evidence to Dr. Gaul, the Head Game Maker, regarding Sejanus's plot to help District residents escape. Using a Jabberyjay, Snow captures Sejanus admitting to the plan and sends the message to Gaul. This information results in Sejanus's execution and the subsequent deaths of the Mayor's daughter and another citizen."],
    ["Coriolanus Snow and Lucy Gray Baird gain popularity in the Capitol through strategic actions and captivating performances. Despite their initial differences and the challenge of being paired together, Snow and Lucy find ways to captivate the citizens of the Capitol and become influential figures.One of the key strategies that Snow and Lucy employ to gain popularity is Lucy's talent as a singer and performer."]
    ]

### General answer by LLMs

In [16]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    return average_score

In [29]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [30]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [31]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [32]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [33]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:08<00:00,  4.91it/s]


{'faithfulness': 0.9444, 'answer_relevancy': 0.3760, 'answer_similarity': 0.8413, 'answer_correctness': 0.3895}


In [34]:
dict_llm_results = dictionary(llm_results)

The average score is: 0.6377986113727302


In [35]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:11<00:00,  3.45it/s]


{'faithfulness': 0.9594, 'answer_relevancy': 0.8764, 'answer_similarity': 0.9042, 'answer_correctness': 0.5814}


In [36]:
dict_llm_results_gpt4 = dictionary(llm_results_gpt4)

The average score is: 0.8303702964930484


# RAG

### Naive RAG

In [7]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [8]:
loader = PyPDFLoader(r"..\ballad\the_ballad_of_songbirds_and_snakes.pdf")
documents = loader.load()

In [39]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../ballad/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [40]:
db_naive = Chroma(persist_directory = "../ballad/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [9]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [42]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)



In [43]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:07,  6.52it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

{'faithfulness': 0.8179, 'answer_relevancy': 0.5780, 'context_precision': 0.5228, 'context_recall': 0.5726, 'answer_similarity': 0.7423, 'answer_correctness': 0.6498}


In [44]:
dict_result_naive_rag = dictionary(result_naive_rag)

The average score is: 0.6472394986419802


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../ballad/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [45]:
db_basic = Chroma(persist_directory = "../ballad/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [46]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [47]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.64it/s]


{'faithfulness': 0.8743, 'answer_relevancy': 0.8323, 'context_precision': 0.5972, 'context_recall': 0.7875, 'answer_similarity': 0.8838, 'answer_correctness': 0.4480}


In [48]:
dict_result_recursive = dictionary(result_recursive)

The average score is: 0.7371898552172951


### chunk sizes

In [10]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

chunk = 1000

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_1000")
db_1000.persist()

In [11]:
db_1000 = Chroma(persist_directory = "../ballad/vectordb/recursive_1000", embedding_function=embeddings_client)

In [51]:
retriever_1000 = db_1000.as_retriever()
result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.39it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9429, 'answer_relevancy': 0.8294, 'context_precision': 0.5889, 'context_recall': 0.8208, 'answer_similarity': 0.8868, 'answer_correctness': 0.4487}


In [52]:
dict_result_1000 = dictionary(result_1000)

The average score is: 0.752912607504265


chunk = 500

In [33]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../ballad/vectordb/recursive_500")
db_500.persist()

107


In [53]:
db_500 = Chroma(persist_directory = "../ballad/vectordb/recursive_500", embedding_function=embeddings_client)

In [54]:
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-333' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packa

CHUNK SIZE 500
{'faithfulness': 0.6822, 'answer_relevancy': 0.7488, 'context_precision': 0.5967, 'context_recall': 0.7615, 'answer_similarity': 0.8450, 'answer_correctness': 0.6514}


In [55]:
dict_result_500 = dictionary(result_500)

The average score is: 0.7142606903567085


chunk = 2000

In [35]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_2000")


37


In [56]:
db_2000 = Chroma(persist_directory = "../ballad/vectordb/recursive_2000", embedding_function=embeddings_client)

In [57]:
retriever_2000 = db_2000.as_retriever()
result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.50it/s]


CHUNK SIZE 2000
{'faithfulness': 0.7667, 'answer_relevancy': 0.8230, 'context_precision': 0.5972, 'context_recall': 0.8208, 'answer_similarity': 0.8832, 'answer_correctness': 0.5073}


In [58]:
dict_result_2000 = dictionary(result_2000)

The average score is: 0.7330396049761241


chunk = 3000

In [38]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_3000")

32


In [59]:
db_3000 = Chroma(persist_directory = "../ballad/vectordb/recursive_3000", embedding_function=embeddings_client)

In [60]:
retriever_3000 = db_3000.as_retriever()
result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
print("CHUNK SIZE 3000")
print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.05it/s]


CHUNK SIZE 3000
{'faithfulness': 0.8595, 'answer_relevancy': 0.8386, 'context_precision': 0.6333, 'context_recall': 0.8208, 'answer_similarity': 0.8843, 'answer_correctness': 0.4686}


In [61]:
dict_result_3000 = dictionary(result_3000)

The average score is: 0.7508713801108652


### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [63]:
retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
result_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 1000, K=3")
print(result_3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:02<00:06,  7.10it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

CHUNK SIZE 1000, K=3
{'faithfulness': 0.5797, 'answer_relevancy': 0.7975, 'context_precision': 0.5393, 'context_recall': 0.6875, 'answer_similarity': 0.7113, 'answer_correctness': 0.7793}


In [64]:
dict_result_3 = dictionary(result_3)

The average score is: 0.6824532277442156


In [65]:
retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
result_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.14it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8000, 'answer_relevancy': 0.8256, 'context_precision': 0.5815, 'context_recall': 0.7800, 'answer_similarity': 0.8859, 'answer_correctness': 0.4240}


In [66]:
dict_result_5 = dictionary(result_5)

The average score is: 0.7161788734282134


In [67]:
retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})
result_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.92it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.8256, 'answer_relevancy': 0.8338, 'context_precision': 0.5782, 'context_recall': 0.8800, 'answer_similarity': 0.8907, 'answer_correctness': 0.5280}


In [68]:
dict_result_6 = dictionary(result_6)

The average score is: 0.7560586606303001


In [69]:
retriever_7= db_1000.as_retriever(search_kwargs={"k": 7})
result_7 = run_and_evaluate(retriever_7, prompt, llm)
print("CHUNK SIZE 1000, K=7")
print(result_7)
dict_result_7 = dictionary(result_7)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.49it/s]


CHUNK SIZE 1000, K=7
{'faithfulness': 0.9429, 'answer_relevancy': 0.8166, 'context_precision': 0.5914, 'context_recall': 0.7633, 'answer_similarity': 0.8919, 'answer_correctness': 0.5029}
The average score is: 0.751507469027492


### look for different retrievers

6 chunks was the best score

#### parent document retriever

In [16]:
parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents", persist_directory = "../ballad/vectordb/parent_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../ballad/vectordb/parent_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
    search_kwargs={"k": 6}
)
parent_document_retriever.add_documents(documents)
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent = dictionary(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.97it/s]


{'faithfulness': 0.8263, 'answer_relevancy': 0.7337, 'context_precision': 0.7000, 'context_recall': 0.8569, 'answer_similarity': 0.8923, 'answer_correctness': 0.4777}
The average score is: 0.7478190875052072


In [17]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../ballad/vectordb/parent_small_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../ballad/vectordb/parent_small_summary", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
    search_kwargs={"k": 6}
)
parent_document_retriever_small.add_documents(documents)
result_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
dict_result_parent_small = dictionary(result_parent_small)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

{'faithfulness': 0.7989, 'answer_relevancy': 0.4232, 'context_precision': 0.5833, 'context_recall': 0.5544, 'answer_similarity': 0.5782, 'answer_correctness': 0.5696}
The average score is: 0.5845850482989023


In [18]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

# vectorstore = Chroma(collection_name="split_parents_large", persist_directory = "../ballad/vectordb/parent_large_summary", embedding_function=embeddings_client)
# vectorstore.persist()
vectorstore = Chroma(persist_directory = "../ballad/vectordb/parent_large_summary", embedding_function=embeddings_client)

store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
    search_kwargs={"k": 6}
)
parent_document_retriever_large.add_documents(documents)
result_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
dict_result_parent_large = dictionary(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.88it/s]


{'faithfulness': 0.7946, 'answer_relevancy': 0.6375, 'context_precision': 0.5542, 'context_recall': 0.7100, 'answer_similarity': 0.8907, 'answer_correctness': 0.4764}
The average score is: 0.6772374702311944


#### Maximum marginal relevance retrieval

In [19]:
retriever_mmr = db_1000.as_retriever(search_type="mmr",search_kwargs={"k": 6})
result_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
average_score_mmr = dictionary(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.61it/s]


Marginal relevance
{'faithfulness': 0.8167, 'answer_relevancy': 0.8113, 'context_precision': 0.6381, 'context_recall': 0.9667, 'answer_similarity': 0.9001, 'answer_correctness': 0.5372}
The average score is: 0.7783108799399754


#### BM25

In [13]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)

In [21]:
retriever_bm25 = BM25Retriever.from_documents(chunks_1000)
result_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
average_score_bm25 = dictionary(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

BM25
{'faithfulness': 0.8168, 'answer_relevancy': 0.4753, 'context_precision': 0.8400, 'context_recall': 0.4261, 'answer_similarity': 0.5471, 'answer_correctness': 0.6587}
The average score is: 0.6273412574850488


#### Ensambler - Hybrid

In [22]:
retriever_6= db_1000.as_retriever(search_kwargs={"k": 6})

In [23]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.75, 0.25])
result_ensemble1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
average_score_ensambler1 = dictionary(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.24it/s]


Ensambler 75/25
{'faithfulness': 0.9714, 'answer_relevancy': 0.7870, 'context_precision': 0.4741, 'context_recall': 0.7500, 'answer_similarity': 0.8999, 'answer_correctness': 0.4921}
The average score is: 0.7290911608987622


In [24]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.5, 0.5])
result_ensemble2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
avg_result_ensemble2 = dictionary(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.79it/s]


Ensambler 50/50
{'faithfulness': 0.9500, 'answer_relevancy': 0.7810, 'context_precision': 0.5291, 'context_recall': 0.8000, 'answer_similarity': 0.8974, 'answer_correctness': 0.4897}
The average score is: 0.7411992190195104


In [25]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_6], weights=[0.25,0.75])
result_ensemble3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
avg_result_ensemble3 = dictionary(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

In [15]:
ensemble_retriever_4 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.25,0.75])
result_ensemble4 = run_and_evaluate(ensemble_retriever_4, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble4)
avg_result_ensemble4 = dictionary(result_ensemble4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

Ensambler 25/75
{'faithfulness': 0.9220, 'answer_relevancy': 0.5278, 'context_precision': 0.8992, 'context_recall': 0.7194, 'answer_similarity': 0.6978, 'answer_correctness': 0.8041}


NameError: name 'dictionary' is not defined

In [17]:
avg_result_ensemble4 = dictionary(result_ensemble4)

The average score is: 0.761723666299702


In [18]:
ensemble_retriever_5 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.5,0.5])
result_ensemble5 = run_and_evaluate(ensemble_retriever_5, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble5)
avg_result_ensemble5 = dictionary(result_ensemble5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.84it/s]


Ensambler 50/50
{'faithfulness': 0.9167, 'answer_relevancy': 0.7830, 'context_precision': 0.5544, 'context_recall': 0.8889, 'answer_similarity': 0.8966, 'answer_correctness': 0.5320}
The average score is: 0.7619267999961684


In [19]:
ensemble_retriever_6 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_mmr], weights=[0.75,0.25])
result_ensemble6 = run_and_evaluate(ensemble_retriever_6, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble6)
avg_result_ensemble6 = dictionary(result_ensemble6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.93it/s]


Ensambler 75/25
{'faithfulness': 0.8921, 'answer_relevancy': 0.7009, 'context_precision': 0.5021, 'context_recall': 0.8000, 'answer_similarity': 0.9053, 'answer_correctness': 0.5154}
The average score is: 0.7193036932050777


#### Multi-stage - reranker

In [20]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [22]:
retriever_context = retriever_mmr
compressor = CohereRerank(top_n=6)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  60%|██████    | 36/60 [00:06<00:02,  9.59it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.51it/s]


Reranker
{'faithfulness': 0.7556, 'answer_relevancy': 0.6267, 'context_precision': 0.6985, 'context_recall': 0.9000, 'answer_similarity': 0.9001, 'answer_correctness': 0.5243}
The average score is: 0.7341855512490597


#### creating context by remaking the query

In [24]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [25]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_mmr.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
print(result_search_query)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.66it/s]


{'faithfulness': 0.9055, 'answer_relevancy': 0.4621, 'context_precision': 0.6764, 'context_recall': 0.7257, 'answer_similarity': 0.8721, 'answer_correctness': 0.4000}


In [26]:
avg_result_search_query = dictionary(result_search_query)

The average score is: 0.6736276695602351


### change model to GPT-4

In [27]:
answers_final_gpt4 = []
contexts_final_gpt4 = []
result_mmr_gpt4 = run_and_evaluate(retriever_mmr, prompt, llm_gpt4)
print("Marginal relevance")
print(result_mmr_gpt4)
average_score_result_mmr_gpt4 = dictionary(result_mmr_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.08it/s]


Marginal relevance
{'faithfulness': 0.9042, 'answer_relevancy': 0.7370, 'context_precision': 0.6381, 'context_recall': 0.9500, 'answer_similarity': 0.8971, 'answer_correctness': 0.5400}
The average score is: 0.777724376964276
