In [17]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
import sys
sys.tracebacklimit = 0
import pandas as pd
import numpy as np

In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
questions = [
    "What happened to Alexei Nivalny in February 2024?",
    "What were the initial findings of the National Transportation Safety Board (NTSB) regarding the door blowout incident on an Alaska Airlines Boeing 737 Max 9, and how did Boeing respond to the report?",
    "What is the current financial situation in Europe regarding support for Ukraine, and what potential challenges might arise in the next 12 months?",
    "What is Prince William's stance on the Israel-Gaza conflict, and what key points did he emphasize during his statement?",
    "What are the key details regarding the arrest of Daniela Klette, an alleged member of the Red Army Faction (RAF), after more than 30 years in hiding?",
    "How does the entertainment industry, particularly Hollywood, respond to the introduction of OpenAI's Sora text-to-video generator, and what concerns and actions have been expressed by industry professionals?",
    "What are the key reasons for the conflict between Israel and Hamas in Gaza?",
    "What are some key factors influencing the upcoming 2024 US presidential election, especially in terms of the candidates' strategies, key battleground states, and significant issues?",
    "How has Boeing responded to recent criticisms and safety concerns, and what steps has the company taken to address the identified issues in its safety culture and aircraft manufacturing processes?",
    "What recent actions and statements by Kim Jong Un have raised concerns about the possibility of North Korea engaging in military conflict, and what factors do experts highlight as potential obstacles to such a war?"
    ]

ground_truths = [
    ["Alexei Navalny died in his prison cell in Siberia on February 16. He was serving a 19-year sentence on charges widely seen as politically motivated. Navalny's colleague, Maria Pevchikh, claimed that he was about to be freed in a prisoner swap. The proposed exchange involved Navalny being exchanged for Vadim Krasikov, a Russian hitman serving a life sentence for murder in Germany. Two US citizens held in Russia were also reportedly part of the deal. However, Kremlin spokesperson Dmitry Peskov denied knowledge of such agreements. Maria Pevchikh stated that negotiations for the prisoner swap had been ongoing for two years and were in their final stage on February 15. Navalny's death occurred on February 16, with prison officials attributing it to him falling ill after a walk. Pevchikh claimed that Putin changed his mind about the deal at the last minute, and the Russian president 'could not tolerate Navalny being free.' The reasons behind Putin's potential agreement to the swap and subsequent change of mind remain unclear. Navalny's widow, Yulia, has accused Putin of killing her husband, and Western leaders have blamed Putin for Navalny's death. Investigations into Navalny's death were ongoing at the time of the provided text, and there were discussions of potential sanctions by the EU and the US on Russia following Navalny's death."],
    ["The National Transportation Safety Board (NTSB) conducted an investigation into the door blowout incident on an Alaska Airlines Boeing 737 Max 9. The initial findings revealed that four key bolts meant to secure the unused door to the fuselage were missing, leading to the door's detachment shortly after takeoff. Boeing accepted accountability for the incident, with its president, Dave Calhoun, acknowledging the need for improvement. Boeing outlined a comprehensive plan to strengthen quality and regain stakeholders' confidence, emphasizing increased inspections, documentation of door plug assembly, and additional scrutiny in the supply chain. Calhoun assured a commitment to transparency and collaboration with regulatory authorities in addressing safety concerns."],
    ["Europe is grappling with the question of how long it can sustain draining financial support for Ukraine, having spent over $100 billion since the crisis began. The war is deadlocked, and recent setbacks for Ukraine, coupled with delays in US financial aid approval, are causing concerns. As fatigue sets in and political pressure grows, there's uncertainty about continued funding, especially if the US pulls financial support. European officials are considering using frozen Russian assets for funding, but concerns about setting precedents and challenges in independently providing weapons make the situation complex. The next 12 months are crucial, with European elections approaching, and maintaining support for Ukraine is deemed essential, even if the US reduces its involvement."],
    ["Prince William expressed deep concern about the human cost of the conflict since the Hamas terrorist attack, called for an end to the fighting, emphasized the need for increased humanitarian support to Gaza, and urged the release of hostages. He highlighted the critical importance of aid delivery and acknowledged the severe humanitarian situation in the region. Additionally, Prince William emphasized the impact on civilians and expressed hope for a brighter future and permanent peace."],
    ["Daniela Klette, a 65-year-old alleged RAF member from the so-called third generation, was arrested in the Berlin district of Kreuzberg after three decades in hiding. She is accused of attempted murder and serious robberies, with charges dating back to the 1980s and 1990s. Klette was identified through fingerprints, did not resist arrest, and has been flown to Bremen for pre-trial detention. A second person, with unconfirmed identity, was also arrested. The arrest is considered a milestone in the fight against 'tterrorism,' with officials highlighting the persistence of pursuing criminals regardless of the time elapsed since their crimes."],
    ["The entertainment industry, including Hollywood, is cautiously reacting to OpenAI's Sora text-to-video generator. While some acknowledge the potential impact on job roles, others express concerns about the tool's current limitations, such as temporal consistency issues and artifacts. Filmmakers like Tyler Perry have taken action, delaying studio expansions due to AI advancements. Industry professionals are advocating for protections around the use of AI models, considering potential job redundancies. OpenAI, aware of these concerns, has expanded licensing agreements and expressed intentions to engage with policymakers, educators, and artists before making Sora publicly available. Concerns also include potential copyright issues and the generation of actors' likenesses without consent. The industry remains hopeful that it can adapt to AI innovations, emphasizing the irreplaceable role of the human creative mind."],
    ["The conflict between Israel and Hamas in Gaza stems from Hamas' desire to create an Islamic state, rejection of Israel's right to exist, and its justification for attacks based on perceived Israeli crimes against Palestinians. The recent escalation involves a series of Hamas attacks on Israel, leading to a significant military response and the ongoing hostage situation. The conflict has resulted in a high death toll, displacement of civilians, and severe humanitarian crises in Gaza."],
    ["The 2024 US presidential election is characterized by a unique rematch between Joe Biden and Donald Trump. Both candidates face challenges and vulnerabilities. Trump won the South Carolina Republican primary, strengthening his path to the GOP nomination. The election is expected to be decided in a few key states, such as Wisconsin, Pennsylvania, Michigan, Arizona, and Georgia. Economic factors, including positive indicators but lingering public concerns, play a role. The campaigns are likely to focus on issues like abortion and immigration, with Democrats emphasizing Trump's impact on abortion laws and Republicans highlighting immigration concerns. Other potential factors include crime rates, the environment, climate change, and foreign policy. The candidates' age, health, and the possibility of third-party candidates add uncertainty to the election landscape. Trump's legal challenges and the aftermath of the January 2021 Capitol attack could also influence public opinion."],
    ["Boeing has responded to recent criticisms and safety concerns by acknowledging accountability for incidents, such as the door blowout on an Alaska Airlines Boeing 737 Max 9. The company's president, Dave Calhoun, emphasized transparency and outlined a comprehensive plan to strengthen quality, including increased inspections, documentation of door plug assembly, and additional scrutiny in the supply chain. Boeing has also undergone organizational changes, with the departure of the head of the 737 program and the appointment of new leadership to enhance focus on safety and quality. Additionally, a recent FAA report highlighted major issues with Boeing's safety culture, and Boeing expressed support for the panel's review, acknowledging the need for continuous improvement in fostering a safety culture within the company."],
    ["Recent actions and statements by Kim Jong Un, including renouncing the goal of peaceful reunification with South Korea, intensifying nuclear threats, and mentioning South Korea as the 'principal enemy, have raised concerns about the potential for North Korea to initiate a military conflict. Experts point out factors such as the lack of support from China and Russia, North Korea's inferior conventional weapons, and doubts about Kim's willingness to risk his regime's stability as obstacles to a full-scale war. Despite increased tensions, some experts believe that provocations may be aimed at negotiation strategies or domestic stability rather than an imminent military attack."]
    ]

In [7]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [8]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [9]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [10]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [11]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answer by LLMs

In [12]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [14]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [18]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.86it/s]


The average score is: 0.6398844441439843
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5       0.94709          0.463838                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.900371            0.468007  0.639884  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [19]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.03it/s]


The average score is: 0.6323262879939882
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4           1.0          0.287687                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.909879            0.536392  0.632326  


In [20]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326


# RAG

### Naive RAG

In [21]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [22]:
loader = DirectoryLoader('../news', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../news/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [23]:
db_naive = Chroma(persist_directory = "../news/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [24]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [25]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [26]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.97it/s]


The average score is: 0.9304607912730924
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.966518           0.91753           0.983333           0.975   

   Answer Similarity  Answer Correctness   Average  
0           0.963399            0.776985  0.930461  


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../news/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [27]:
db_basic = Chroma(persist_directory = "../news/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [28]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [29]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.84it/s]


The average score is: 0.8973359714366723


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336


### chunk sizes

In [30]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [32]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../news/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory = f"../news/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}", embedding_function=embeddings_client)
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.80it/s]


The average score is: 0.9133206697764482
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.983333          0.925543           0.991667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.946429           0.961035            0.671918  0.913321  
Chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.84it/s]


The average score is: 0.9119826794772815
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.974603          0.932008           0.991667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.902778           0.960365            0.710475  0.911983  
Chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.74it/s]


The average score is: 0.8868214139392477
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%      0.943386          0.807349               0.95   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.943333           0.962567            0.714292  0.886821  
Chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.43it/s]


The average score is: 0.9086637065490365
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.982143          0.898565           0.963889   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.977778            0.96166            0.667948  0.908664  
Chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.14it/s]


The average score is: 0.9072624385789081
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.954861          0.912232           0.955556   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.977778            0.96126            0.681888  0.907262  
Chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-662' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-663' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-664' coro=<AsyncClient.aclose() done, defined

The average score is: 0.9250647005919422
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%      0.987654          0.907493           0.983333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.955           0.959331            0.757576  0.925065  
Chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.41it/s]


The average score is: 0.9118793657816106
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%      0.984127          0.891685              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.94           0.961574             0.71889  0.911879  
Chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.40it/s]


The average score is: 0.9093015956435017
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%      0.987654          0.889549           0.983333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.94           0.961549            0.693724  0.909302  
Chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.64it/s]


The average score is: 0.9209129249333007
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%      0.987654          0.900701           0.983333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.955           0.962723            0.736066  0.920913  
Chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.39it/s]


The average score is: 0.9116297340988951
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%      0.953086          0.937404           0.983333   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0            0.94           0.962642            0.693313  0.91163  
Chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-976' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-977' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-978' coro=<AsyncClient.aclose() done, defined

The average score is: 0.9136525045990239
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.949735          0.867421              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.963794            0.725964  0.913653  
Chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.89it/s]


The average score is: 0.9162885287561159
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%      0.968254          0.876055           0.966667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.971429           0.963941            0.751386  0.916289  
Chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.70it/s]


The average score is: 0.8974300761898079
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%       0.96875          0.769901              0.975   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0             1.0           0.964368            0.706562  0.89743  
Chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.16it/s]


The average score is: 0.8890887209203086
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.943452          0.781496           0.966667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.983333           0.964392            0.695192  0.889089  
Chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.97it/s]


The average score is: 0.9138253337038399
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.964286           0.86897           0.966667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             1.0           0.963397            0.719633  0.913825  
Chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1296' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1297' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1298' coro=<AsyncClient.aclose() done, defi

The average score is: 0.9094172273979401
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.952381          0.853019              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.963986            0.732118  0.909417  
Chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.18it/s]


The average score is: 0.9062728701136448
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%       0.96875          0.834882              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.963353            0.715653  0.906273  
Chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.49it/s]


The average score is: 0.9063735498045288
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.952381          0.829536           0.958333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.965752            0.752239  0.906374  
Chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.83it/s]


The average score is: 0.8895145277177904
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%      0.873469          0.843047              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.963603            0.701967  0.889515  
Chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.47it/s]


The average score is: 0.9057416037974716
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.951389          0.856902              0.975   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.965436            0.705722  0.905742  


In [33]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
3,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336
4,"Chunk 500, overlap 0%",0.983333,0.925543,0.991667,0.946429,0.961035,0.671918,0.913321
5,"Chunk 500, overlap 5%",0.974603,0.932008,0.991667,0.902778,0.960365,0.710475,0.911983
6,"Chunk 500, overlap 10%",0.943386,0.807349,0.95,0.943333,0.962567,0.714292,0.886821
7,"Chunk 500, overlap 15%",0.982143,0.898565,0.963889,0.977778,0.96166,0.667948,0.908664
8,"Chunk 500, overlap 20%",0.954861,0.912232,0.955556,0.977778,0.96126,0.681888,0.907262
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065


In [34]:
results_df.to_csv(f"../news/results/results_summary.csv", index=False)

In [36]:
highest_average = results_df["Average"].max()
highest_average

0.9304607912730924

chunk = 1000

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../news/vectordb/recursive_1000")
# db_1000.persist()

In [27]:
# db_1000 = Chroma(persist_directory = "../news/vectordb/recursive_1000", embedding_function=embeddings_client)

In [28]:
# retriever_1000 = db_1000.as_retriever()
# result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.18it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9841, 'answer_relevancy': 0.8044, 'context_precision': 0.9750, 'context_recall': 0.9550, 'answer_similarity': 0.9614, 'answer_correctness': 0.6946}


chunk = 500

In [33]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../news/vectordb/recursive_500")
# db_500.persist()

107


In [None]:
# db_500 = Chroma(persist_directory = "../news/vectordb/recursive_500", embedding_function=embeddings_client)

In [34]:
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.89it/s]


CHUNK SIZE 500
{'faithfulness': 0.9378, 'answer_relevancy': 0.7268, 'context_precision': 0.9500, 'context_recall': 0.9472, 'answer_similarity': 0.9616, 'answer_correctness': 0.7098}


chunk = 2000

In [35]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../news/vectordb/recursive_2000")


37


In [8]:
# db_2000 = Chroma(persist_directory = "../news/vectordb/recursive_2000", embedding_function=embeddings_client)

In [37]:
# retriever_2000 = db_2000.as_retriever()
# result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.28it/s]


CHUNK SIZE 2000
{'faithfulness': 0.9435, 'answer_relevancy': 0.8731, 'context_precision': 0.9750, 'context_recall': 1.0000, 'answer_similarity': 0.9636, 'answer_correctness': 0.7390}


chunk = 3000

In [38]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../news/vectordb/recursive_3000")

32


In [None]:
# db_3000 = Chroma(persist_directory = "../news/vectordb/recursive_3000", embedding_function=embeddings_client)

In [39]:
# retriever_3000 = db_3000.as_retriever()
# result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
# print("CHUNK SIZE 3000")
# print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.30it/s]


CHUNK SIZE 3000
{'faithfulness': 0.9592, 'answer_relevancy': 0.7520, 'context_precision': 0.9583, 'context_recall': 0.9800, 'answer_similarity': 0.9645, 'answer_correctness': 0.7113}


### now time to look for different top-k

Note: We continue with the size chunk of 2000 as it had the highest average score

In [37]:
k_values = [2, 3, 5]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_naive.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size {chunk_size}, overlap {overlap_percentage}%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.93it/s]


The average score is: 0.8950808134091349
Results for K=2:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=2      0.965608          0.814959   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.857778           0.961771            0.770368   

    Average  
0  0.895081  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.09it/s]


The average score is: 0.9012849460805582
Results for K=3:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=3      0.945833          0.906465   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.983333        0.877222           0.963646            0.731209   

    Average  
0  0.901285  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.93it/s]


The average score is: 0.8511285114372145
Results for K=5:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=5      0.864198          0.705882   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.969306        0.971429           0.957138             0.63882   

    Average  
0  0.851129  


In [38]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
3,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336
4,"Chunk 500, overlap 0%",0.983333,0.925543,0.991667,0.946429,0.961035,0.671918,0.913321
5,"Chunk 500, overlap 5%",0.974603,0.932008,0.991667,0.902778,0.960365,0.710475,0.911983
6,"Chunk 500, overlap 10%",0.943386,0.807349,0.95,0.943333,0.962567,0.714292,0.886821
7,"Chunk 500, overlap 15%",0.982143,0.898565,0.963889,0.977778,0.96166,0.667948,0.908664
8,"Chunk 500, overlap 20%",0.954861,0.912232,0.955556,0.977778,0.96126,0.681888,0.907262
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065


In [14]:
# retriever_2 = db_2000.as_retriever(search_kwargs={"k": 2})
# result_2 = run_and_evaluate(retriever_2, prompt, llm)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.54it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 0.9306, 'answer_relevancy': 0.9134, 'context_precision': 1.0000, 'context_recall': 0.9289, 'answer_similarity': 0.9617, 'answer_correctness': 0.7467}


In [15]:
# retriever_3 = db_2000.as_retriever(search_kwargs={"k": 3})
# result_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.10it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.9260, 'answer_relevancy': 0.8950, 'context_precision': 0.9833, 'context_recall': 0.9750, 'answer_similarity': 0.9641, 'answer_correctness': 0.7973}


In [16]:
# retriever_5 = db_2000.as_retriever(search_kwargs={"k": 5})
# result_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.76it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.9688, 'answer_relevancy': 0.7442, 'context_precision': 0.9617, 'context_recall': 0.9714, 'answer_similarity': 0.9619, 'answer_correctness': 0.7390}


In [17]:
# retriever_6= db_2000.as_retriever(search_kwargs={"k": 6})
# result_6 = run_and_evaluate(retriever_6, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  63%|██████▎   | 38/60 [00:06<00:02,  9.86it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating:  97%|█████████▋| 58/60 [00:13<00:01,  1.38it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    retu

CHUNK SIZE 1000, K=5
{'faithfulness': 0.7835, 'answer_relevancy': 0.8474, 'context_precision': 0.8161, 'context_recall': 0.9210, 'answer_similarity': 0.9402, 'answer_correctness': 0.8747}


In [23]:
# dict_res6 = dict(result_6)
# average_score = sum(dict_res6.values()) / len(dict_res6)
# print(f"The average score is: {average_score}")

The average score is: 0.8638028044818998


### look for different retrievers

3 chunks was the best score

#### parent document retriever

In [39]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../vectordb-edit/parent_summarize", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.20it/s]


The average score is: 0.897963075414468
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.973765          0.821909   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.936111           0.961714            0.694279   

    Average  
0  0.897963  


In [40]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../vectordb-edit/parent_small_summarize", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.13it/s]


The average score is: 0.914825196996499
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.988889          0.912322   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.941667        0.952778           0.962236            0.731059   

    Average  
0  0.914825  


In [41]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../vectordb-edit/parent_large_summarize", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.31it/s]


The average score is: 0.8627504709173657
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200        0.9875          0.643101   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.941667        0.952778           0.962287            0.689171   

   Average  
0  0.86275  


In [42]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
3,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336
4,"Chunk 500, overlap 0%",0.983333,0.925543,0.991667,0.946429,0.961035,0.671918,0.913321
5,"Chunk 500, overlap 5%",0.974603,0.932008,0.991667,0.902778,0.960365,0.710475,0.911983
6,"Chunk 500, overlap 10%",0.943386,0.807349,0.95,0.943333,0.962567,0.714292,0.886821
7,"Chunk 500, overlap 15%",0.982143,0.898565,0.963889,0.977778,0.96166,0.667948,0.908664
8,"Chunk 500, overlap 20%",0.954861,0.912232,0.955556,0.977778,0.96126,0.681888,0.907262
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065


In [43]:
highest_average = results_df["Average"].max()
highest_average

0.9304607912730924

#### Maximum marginal relevance retrieval

In [45]:
retriever_mmr = db_naive.as_retriever(search_type="mmr")
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:05<00:21,  2.25it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.99it/s]


The average score is: 0.8414345596056197
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.887066          0.616988           0.978668        0.783352   

   Answer Similarity  Answer Correctness   Average  
0            0.93808            0.844453  0.841435  


#### BM25

In [46]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)

Created a chunk of size 4589, which is longer than the specified 4000
Created a chunk of size 4082, which is longer than the specified 4000


In [48]:
retriever_bm25 = BM25Retriever.from_documents(chunks)
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  7.02it/s]


The average score is: 0.9089806950408187
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25      0.981481          0.896899               0.95        0.958333   

   Answer Similarity  Answer Correctness   Average  
0           0.959879            0.707291  0.908981  


#### Ensambler - Hybrid

In [49]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_naive], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.43it/s]


The average score is: 0.8964416058855433
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1      0.943603          0.875975              0.945   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.935           0.962835            0.716237  0.896442  


In [50]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_naive], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 13/60 [00:05<00:19,  2.47it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  33%|███▎      | 20/60 [00:05<00:08,  4.66it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.21it/s]


The average score is: 0.8646735668077402
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2      0.798848          0.843394           0.791111   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.961667           0.958949            0.834073  0.864674  


In [51]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_naive], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.95it/s]


The average score is: 0.8706332328374514
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3      0.957386          0.690303           0.980556   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.946429           0.955261            0.693865  0.870633  


#### Multi-stage - reranker

In [42]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [45]:
retriever_context = retriever_2000
compressor = CohereRerank(top_n = 3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.99it/s]


Reranker
{'faithfulness': 0.8968, 'answer_relevancy': 0.8880, 'context_precision': 0.9417, 'context_recall': 0.9750, 'answer_similarity': 0.9576, 'answer_correctness': 0.6623}
The average score is: 0.8868849276357208


#### creating context by remaking the query

In [52]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
3,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336
4,"Chunk 500, overlap 0%",0.983333,0.925543,0.991667,0.946429,0.961035,0.671918,0.913321
5,"Chunk 500, overlap 5%",0.974603,0.932008,0.991667,0.902778,0.960365,0.710475,0.911983
6,"Chunk 500, overlap 10%",0.943386,0.807349,0.95,0.943333,0.962567,0.714292,0.886821
7,"Chunk 500, overlap 15%",0.982143,0.898565,0.963889,0.977778,0.96166,0.667948,0.908664
8,"Chunk 500, overlap 20%",0.954861,0.912232,0.955556,0.977778,0.96126,0.681888,0.907262
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065


In [53]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [54]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_naive.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.66it/s]


{'faithfulness': 0.9467, 'answer_relevancy': 0.9207, 'context_precision': 0.9833, 'context_recall': 0.9550, 'answer_similarity': 0.9637, 'answer_correctness': 0.7499}

In [55]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.9198853575339109


In [56]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.94709,0.463838,,,0.900371,0.468007,0.639884
1,GPT-4,1.0,0.287687,,,0.909879,0.536392,0.632326
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
3,Recursive,0.966518,0.737454,0.983333,0.96,0.963403,0.773308,0.897336
4,"Chunk 500, overlap 0%",0.983333,0.925543,0.991667,0.946429,0.961035,0.671918,0.913321
5,"Chunk 500, overlap 5%",0.974603,0.932008,0.991667,0.902778,0.960365,0.710475,0.911983
6,"Chunk 500, overlap 10%",0.943386,0.807349,0.95,0.943333,0.962567,0.714292,0.886821
7,"Chunk 500, overlap 15%",0.982143,0.898565,0.963889,0.977778,0.96166,0.667948,0.908664
8,"Chunk 500, overlap 20%",0.954861,0.912232,0.955556,0.977778,0.96126,0.681888,0.907262
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065


### change model to GPT-4

In [57]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
2,Naive,0.966518,0.91753,0.983333,0.975,0.963399,0.776985,0.930461
9,"Chunk 1000, overlap 0%",0.987654,0.907493,0.983333,0.955,0.959331,0.757576,0.925065
12,"Chunk 1000, overlap 15%",0.987654,0.900701,0.983333,0.955,0.962723,0.736066,0.920913


In [58]:
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [59]:
result_gpt4_naive, results_df = run_and_evaluate(f"Naive retriever, GPT-4", retriever_naive, prompt, llm_gpt4, results_df)
result_gpt4_naive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.81it/s]


The average score is: 0.9256730362654243


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Naive retriever, GPT-4",1.0,0.915748,0.991667,0.975,0.961886,0.709738,0.925673


In [60]:
db_0 = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_0", embedding_function=embeddings_client)
ret_0 = db_0.as_retriever()
result_1000_0_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 0%, GPT-4", ret_0, prompt, llm_gpt4, results_df)
result_1000_0_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.18it/s]


The average score is: 0.9043818126079827


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",0.9875,0.814894,0.975,0.955,0.961471,0.732427,0.904382


In [61]:
db_15 = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_15", embedding_function=embeddings_client)
ret_15 = db_15.as_retriever()
result_1000_15_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 15%, GPT-4", ret_15, prompt, llm_gpt4, results_df)
result_1000_15_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.62it/s]


The average score is: 0.9038339747221363


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 15%, GPT-4",0.970833,0.882859,0.983333,0.94,0.964804,0.681174,0.903834


In [62]:
results_df.to_csv(f"../news/results/results_summary.csv", index=False)