In [2]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
import sys
sys.tracebacklimit = 0
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [4]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [5]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [6]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [7]:
questions = [
    "Finish writing the following short report: On February 24, 2024, former President Donald Trump secured a significant victory in the South Carolina Republican primary, defeating Nikki Haley in her home state. This report aims to analyze the key factors leading to Trump's success, the impact on the ongoing presidential race, and potential challenges for other candidates.",
    "Finish writing the following short report: Amidst the mysterious death of Russian opposition leader Alexei Navalny on February 16, 2024, this report investigates the claims made by his colleague, Maria Pevchikh, suggesting that Navalny was on the brink of being freed in a prisoner swap. The alleged plan involved exchanging Navalny for Russian hitman Vadim Krasikov, serving a life sentence in Germany, and two US citizens held in Russia. The report delves into the events leading up to Navalny's demise, the proposed prisoner swap, and the geopolitical implications surrounding this tragic incident.",
    "Please provide a concise report on the Israel-Hamas conflict, focusing on the recent Israeli special forces operation at Nasser Hospital in Khan Younis, Gaza. Include information on the reasons for the operation, international reactions, and the current state of the conflict.",
    "Finish writing the following report: In a recent incident involving an Alaska Airlines Boeing 737 Max 9, a door blowout occurred shortly after take-off, prompting the grounding of 171 planes of the same model in the United States. Boeing's president and CEO, Dave Calhoun, publicly admitted the company's mistake, acknowledging that a missing 27kg door 'plug' was not properly secured.",
    "Finish writing the following short report: Elon Musk's desire for increased control over Tesla has sparked discussions about his stake and concerns regarding potential takeovers.",
    "Finish writing the following short report: Hollywood remains unfazed by OpenAI's Sora, the latest text-to-video generator, despite its ability to quickly produce high-quality videos from simple prompts.",
    "Finish writing the following short report: Prince William's unexpected withdrawal from the memorial service for the late former King Constantine of Greece.",
    "Finish writing the following short report: As Ukraine commemorates the second anniversary of Russia's invasion, Europe confronts pressing questions about sustaining financial support for the war-torn nation."
]

ground_truths = [
    ["Donald Trump won the South Carolina Republican primary on February 24, 2024. The victory solidifies Trump's path towards securing the GOP nomination for the third consecutive time. Nikki Haley, Trump's former representative to the UN and former South Carolina governor, faced challenges despite holding more campaign events. South Carolina's first-in-the-South primary has historically been a reliable indicator of the eventual Republican nominee. Trump has won all significant contests for Republican delegates, including Iowa, New Hampshire, Nevada, and the US Virgin Islands. Haley, despite vowing to continue running, faces pressure to reconsider her candidacy after Trump's South Carolina victory. Trump's strength endures despite facing 91 criminal charges related to efforts to overturn the 2020 election and other controversies. Trump's first criminal trial is scheduled for March 25, 2024, in New York. The 2024 presidential race is increasingly shaping up as a rematch between Trump and President Joe Biden. Biden, facing criticism for supporting Israel in the Gaza War, is preparing for a tough electoral battle against Trump.Trump's age (77) and Biden's age (81) become focal points in the election discourse, with both candidates trading jabs."],
    ["Navalny died on February 16, 2024, in a Siberian prison colony while serving a 19-year sentence. Maria Pevchikh claimed that Navalny was about to be freed in a prisoner swap involving Vadim Krasikov and two US citizens. Kremlin spokesperson Dmitry Peskov denied knowledge of any such agreements or negotiations. Yulia Navalnaya accused Vladimir Putin of killing her husband and demanded the return of Navalny's body. Navalny's body was reportedly held for 'chemical analysis,' leading to suspicions of poisoning with Novichok. International leaders, including the EU and the US, held Putin responsible for Navalny's death and considered potential sanctions. Navalny's opposition activities, his previous poisoning in 2020, and the role of economic policies, particularly mass privatization, in shaping the conditions leading to his tragic death."],
    ["Israeli special forces entered Nasser Hospital in Khan Younis, Gaza, based on 'credible intelligence' that Hamas held captives from the October 7 attacks. The operation aimed to locate bodies of Israeli hostages. Prime Minister Netanyahu vowed to proceed with an invasion of Rafah. Prince William expressed concern over the conflict, emphasizing the need for humanitarian support, hostage release, and permanent peace. The report should cover these key points, incorporating relevant details and developments."],
    ["""Alaska Airlines Boeing 737 Max 9 door blowout incident.
Grounding of 171 Boeing 737 Max 9 planes in the U.S.
Boeing's acknowledgment of the mistake by Dave Calhoun.
Details about the missing 27kg door "plug" and the four crucial bolts.
Rapid loss of cabin pressure and the National Transportation Safety Board's preliminary findings.
Highlight of manufacturing defects and quality control issues at Boeing.
Mention of executive changes at Boeing.
The strain on Boeing's relationship with airlines and increased safety concerns. The missing section of the plane was retrieved from the back garden of a Portland teacher.
Alaska Airlines had placed restrictions on the aircraft following pressurization warnings before the incident.
Boeing's president and CEO, Dave Calhoun, reassured Boeing staff, promising 100% transparency in addressing the mistake.
Boeing organized a "Quality Stand Down" at its 737 factory in Renton, Washington, focusing on quality issues.
The National Safety Transportation Board (NTSB) is expected to release its preliminary findings on the Alaska Airlines 737 Max 9 incident.
Critics argue that Boeing's shift in corporate culture towards prioritizing profits over safety started after the 1997 merger with McDonnell Douglas.
Boeing's move away from its Pacific Northwest roots, the shift in leadership with financial backgrounds, and its reliance on suppliers are highlighted.
The report emphasizes the impact of cost-cutting measures on Boeing's engineering and manufacturing, leading to various safety and quality issues.
Boeing's financial struggles, including cumulative net losses of more than $26 billion over the last five years, are outlined.
A new FAA report finds major issues with Boeing's safety culture, describing it as "inadequate" and "confusing."
The report identifies gaps in Boeing's safety journey, highlighting employee confusion, inconsistent reporting channels, and a fear of retaliation.
Boeing's safety improvements since 2019, such as establishing an aerospace safety committee and a chief aerospace safety office, are mentioned."""],
    ["Elon Musk's desire for increased control over Tesla has sparked discussions about his stake and concerns regarding potential takeovers. Elon Musk, with approximately 13% ownership in Tesla, expressed a desire for increased control due to concerns about potential takeovers, particularly by 'dubious interests.' He highlighted unease regarding Tesla's investments in artificial intelligence and stated a preference for a 25% voting control. The comments led to a 'firestorm' for Tesla, impacting its self-driving and automated capabilities narrative. Additionally, ongoing legal action surrounds a 10-year pay package granted to Musk in 2018, valued at about $55 billion."],
    ["Hollywood remains unfazed by OpenAI's Sora, the latest text-to-video generator, despite its ability to quickly produce high-quality videos from simple prompts. Industry experts express concerns about potential job displacement and creative challenges, with some figures like Tyler Perry already taking action. Sora, though not broadly available yet, has garnered attention for its impressive capabilities, generating hyperrealistic videos up to 1 minute long. Sora is the latest leap in artificial intelligence software, allowing users to create staggeringly realistic videos from a simple worded prompt"],
    ["Prince William's unexpected withdrawal from the memorial service for the late former King Constantine of Greece was attrubyted to a 'personal matter'.  In the absence of Prince William, Queen Camilla led the Royal Family at St George's Chapel in Windsor Castle. Despite concerns, royal sources assured the public that there was no cause for alarm. "],
    ["As Ukraine commemorates the second anniversary of Russia's invasion, Europe confronts pressing questions about sustaining financial support for the war-torn nation. The war, deadlocked for some time, witnessed Ukraine's recent withdrawal from the key town of Avdiivka, marking its worst defeat since May. Financial aid from the United States, currently awaiting House approval, is crucial, with the EU and NATO unity showing signs of strain. Despite spending over $100 billion on Ukraine's defense, Europe is grappling with the challenge of political fatigue and the potential impact of upcoming elections. "]
    ]

In [8]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [9]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [10]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [11]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [12]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

In [14]:
results_df = pd.read_csv(f"../news/results/results_reporting.csv")

In [15]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


### General answer by LLMs

In [12]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [14]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""]]

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [16]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/48 [00:01<00:47,  1.02s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  15%|█▍        | 7/48 [00:02<00:14,  2.88it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 48/48 [00:09<00:00,  5.05it/s]


The average score is: 0.6705668424093935
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5      0.909415           0.59349                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.910937            0.546557  0.670567  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  31%|███▏      | 15/48 [00:03<00:05,  5.82it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.86it/s]


The average score is: 0.6772455000098149
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4      0.476318               1.0                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.674566            0.853289  0.677246  


In [18]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246


# RAG

### Naive RAG

In [23]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [16]:
loader = DirectoryLoader('../news', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../news/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [21]:
db_naive = Chroma(persist_directory = "../news/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [17]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [23]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [24]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.62it/s]


The average score is: 0.8744205907920808
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive          0.95          0.753455           0.989583           0.975   

   Answer Similarity  Answer Correctness   Average  
0           0.955112            0.623373  0.874421  


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../news/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [25]:
db_basic = Chroma(persist_directory = "../news/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [26]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [27]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.97it/s]


The average score is: 0.866022562327362


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023


### chunk sizes

In [18]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

chunk = 1000

In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../news/vectordb/recursive_1000")
# db_1000.persist()

In [31]:
# db_1000 = Chroma(persist_directory = "../news/vectordb/recursive_1000", embedding_function=embeddings_client)

In [34]:
# # retriever_1000 = db_1000.as_retriever()
# result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
# print("CHUNK SIZE 1000")
# print(result_1000)
# avg_result_1000, dict_result_1000 = dictionary(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:16<00:00,  2.97it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9750, 'answer_relevancy': 0.7460, 'context_precision': 0.9583, 'context_recall': 0.9009, 'answer_similarity': 0.9595, 'answer_correctness': 0.6084}
The average score is: 0.8580138744596889


chunk = 500

In [33]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../news/vectordb/recursive_500")
# db_500.persist()

107


In [35]:
# db_500 = Chroma(persist_directory = "../news/vectordb/recursive_500", embedding_function=embeddings_client)

In [36]:
# retriever_500 = db_500.as_retriever()
# result_500 = run_and_evaluate(retriever_500, prompt, llm)
# print("CHUNK SIZE 500")
# print(result_500)
# avg_result_500, dict_result_500 = dictionary(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:16<00:00,  2.92it/s]


CHUNK SIZE 500
{'faithfulness': 0.9476, 'answer_relevancy': 0.7588, 'context_precision': 0.9653, 'context_recall': 0.8492, 'answer_similarity': 0.9608, 'answer_correctness': 0.6713}
The average score is: 0.8588243339324139


chunk = 2000

In [35]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../news/vectordb/recursive_2000")


37


In [37]:
# db_2000 = Chroma(persist_directory = "../news/vectordb/recursive_2000", embedding_function=embeddings_client)

In [38]:
# retriever_2000 = db_2000.as_retriever()
# result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
# print("CHUNK SIZE 2000")
# print(result_2000)
# avg_result_2000, dict_result_2000 = dictionary(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:16<00:00,  2.85it/s]


CHUNK SIZE 2000
{'faithfulness': 0.9298, 'answer_relevancy': 0.7010, 'context_precision': 0.9896, 'context_recall': 0.9375, 'answer_similarity': 0.9615, 'answer_correctness': 0.5258}
The average score is: 0.8408431168806643


In [55]:
# avg_result_2000, dict_result_2000 = dictionary(result_2000)

The average score is: 0.8408431168806643


chunk = 3000

In [38]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../news/vectordb/recursive_3000")

32


In [39]:
# db_3000 = Chroma(persist_directory = "../news/vectordb/recursive_3000", embedding_function=embeddings_client)

In [40]:
# retriever_3000 = db_3000.as_retriever()
# result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
# print("CHUNK SIZE 3000")
# print(result_3000)
# avg_result_3000 ,dict_result_3000 = dictionary(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  96%|█████████▌| 46/48 [00:09<00:00,  2.48it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 48/48 [00:22<00:00,  2.11it/s]


CHUNK SIZE 3000
{'faithfulness': 0.9429, 'answer_relevancy': 0.8032, 'context_precision': 0.9896, 'context_recall': 0.9643, 'answer_similarity': 0.9165, 'answer_correctness': 0.6267}
The average score is: 0.8738612558505032


In [56]:
# avg_result_3000 ,dict_result_3000 = dictionary(result_3000)

The average score is: 0.8738612558505032
This is the new best value!


In [29]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        # text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # # Split documents
        # chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Chunk size {chunk_size}, overlap {overlap_percentage}%")
        
        # Create Chroma database
        # db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../news/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db = Chroma(persist_directory = f"../news/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}", embedding_function=embeddings_client)
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Chunk size 500, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.61it/s]


The average score is: 0.8754647569443441
This is the new best value!
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.948148          0.881547           0.979167   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.874188           0.953518             0.61622  0.875465  
Chunk size 500, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.68it/s]


The average score is: 0.8602391415249621
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.937698          0.761564           0.979167   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.905438           0.953697             0.62387  0.860239  
Chunk size 500, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:11<00:00,  4.03it/s]


The average score is: 0.852960354597658
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%      0.954167          0.751947           0.954861   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.849188           0.960167            0.647432  0.85296  
Chunk size 500, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.95it/s]


The average score is: 0.865960388223181
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%           1.0          0.758759           0.965278   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.893327           0.958609            0.619789  0.86596  
Chunk size 500, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-448' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-449' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-450' coro=<AsyncClient.aclose() done, defined

The average score is: 0.8567862081399237
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%       0.97619          0.748401           0.975694   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.874577           0.959806            0.606048  0.856786  
Chunk size 1000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.58it/s]


The average score is: 0.8593085846047387
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%       0.96131          0.731361           0.989583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.94375           0.952565            0.577283  0.859309  
Chunk size 1000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.44it/s]


The average score is: 0.8861483593755913
This is the new best value!
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%      0.959524          0.858199                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.932143            0.96074            0.606284  0.886148  
Chunk size 1000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.68it/s]


The average score is: 0.8587212230494434
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%      0.964583          0.742219           0.954861   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.925893           0.961111            0.603661  0.858721  
Chunk size 1000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.63it/s]


The average score is: 0.878509448569548
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%       0.96875          0.857516           0.954861   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.925893           0.957612            0.606425  0.878509  
Chunk size 1000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.29it/s]


The average score is: 0.8617967486193542
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%         0.975          0.751746           0.958333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.900893           0.953301            0.631507  0.861797  
Chunk size 2000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.63it/s]


The average score is: 0.8539123275997493
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.947619          0.703084            0.96875   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0          0.9125           0.950657            0.640864  0.853912  
Chunk size 2000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.34it/s]


The average score is: 0.8455333514249714
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%       0.95102          0.613685           0.989583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0          0.9125           0.947995            0.658416  0.845533  
Chunk size 2000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.35it/s]


The average score is: 0.8521551866742114
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%         0.975           0.69731           0.989583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0          0.9125           0.960437              0.5781  0.852155  
Chunk size 2000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:15<00:00,  3.05it/s]


The average score is: 0.8746659123758431
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.952381          0.806644           0.989583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.96875           0.959554            0.571083  0.874666  
Chunk size 2000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.47it/s]


The average score is: 0.8561608298286497
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%      0.957143          0.699601           0.979167   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.96875           0.958829            0.573475  0.856161  
Chunk size 3000, overlap 0%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  96%|█████████▌| 46/48 [00:09<00:00,  2.07it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 48/48 [00:15<00:00,  3.01it/s]


The average score is: 0.827443979604254
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.924339          0.710273                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.857143           0.925189            0.547721  0.827444  
Chunk size 3000, overlap 5%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1063' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1064' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1065' coro=<AsyncClient.aclose() done, defi

The average score is: 0.832099094723067
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.908163          0.703264                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.828571           0.924039            0.628557  0.832099  
Chunk size 3000, overlap 10%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  96%|█████████▌| 46/48 [00:08<00:00,  4.07it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 48/48 [00:18<00:00,  2.57it/s]


The average score is: 0.8522771425673853
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%         0.975          0.811026           0.989583   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.828571           0.916237            0.593245  0.852277  
Chunk size 3000, overlap 15%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  94%|█████████▍| 45/48 [00:07<00:00,  6.34it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.53it/s]


The average score is: 0.8737519546977928
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%          0.96          0.805409                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.935714           0.924271            0.617117  0.873752  
Chunk size 3000, overlap 20%


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  94%|█████████▍| 45/48 [00:08<00:00,  4.23it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.44it/s]


The average score is: 0.82734673693958
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%          0.96          0.704375                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.828571           0.924642            0.546492  0.827347  


In [31]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


In [30]:
highest_average = results_df["Average"].max()
highest_average

0.8861483593755913

In [32]:
results_df.to_csv(f"../news/results/results_reporting.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 1000, overlap 5% as it had the highest average score

In [19]:
db_k = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_5", embedding_function=embeddings_client)

In [30]:
k_values = [2, 3, 5]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size 1000, overlap 5%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:10<00:00,  4.59it/s]


The average score is: 0.8452441148842914
This is the new best value!
Results for K=2:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 5%, K=2      0.900794          0.770222   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.814087           0.959437            0.626925   

    Average  
0  0.845244  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.97it/s]


The average score is: 0.8409128191342289
Results for K=3:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 5%, K=3      0.894345          0.734796   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.878472           0.953352            0.584511   

    Average  
0  0.840913  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  10%|█         | 5/48 [00:03<00:24,  1.75it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  58%|█████▊    | 28/48 [00:04<00:02,  9.37it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.32it/s]


The average score is: 0.8879916512753878
This is the new best value!
Results for K=5:
                             System  Faithfulness  Answer Relevancy  \
0  Chunk size 1000, overlap 5%, K=5      0.752391          0.909552   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.933543         0.96875           0.941679            0.822034   

    Average  
0  0.887992  


In [31]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


In [32]:
highest_average = results_df["Average"].max()
highest_average

0.8879916512753878

In [57]:
# retriever_2 = db_3000.as_retriever(search_kwargs={"k": 2})
# result_2 = run_and_evaluate(retriever_2, prompt, llm)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)
# avg_result_2, dict_result_2 = dictionary(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.29it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 0.9667, 'answer_relevancy': 0.7627, 'context_precision': 1.0000, 'context_recall': 0.8884, 'answer_similarity': 0.9599, 'answer_correctness': 0.6383}
The average score is: 0.8693258385468209


In [58]:
# retriever_3 = db_3000.as_retriever(search_kwargs={"k": 3})
# result_3 = run_and_evaluate(retriever_3, prompt, llm)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)
# avg_result_3, dict_result_3 = dictionary(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:17<00:00,  2.70it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.9435, 'answer_relevancy': 0.7420, 'context_precision': 1.0000, 'context_recall': 0.9931, 'answer_similarity': 0.9548, 'answer_correctness': 0.6497}
The average score is: 0.8805190156333335
This is the new best value!


In [60]:
# retriever_5 = db_3000.as_retriever(search_kwargs={"k": 5})
# result_5 = run_and_evaluate(retriever_5, prompt, llm)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)
# avg_result_5, dict_result_5 = dictionary(result_5)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/48 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8369 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:   4%|▍         | 2/48 [00:02<00:52,  1.14s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8510 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_ex

CHUNK SIZE 1000, K=5
{'faithfulness': 0.7877, 'answer_relevancy': 0.6899, 'context_precision': 0.9040, 'context_recall': 0.9881, 'answer_similarity': 0.9665, 'answer_correctness': 0.6658}
The average score is: 0.8336666858719243


### look for different retrievers

3 chunks was the best score

#### parent document retriever

In [33]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../vectordb-edit/parent_report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.77it/s]


The average score is: 0.8431039501098861
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.883333          0.767563   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.979167            0.85            0.95619            0.622371   

    Average  
0  0.843104  


In [34]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../vectordb-edit/parent_small_report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.66it/s]


The average score is: 0.7855730785440166
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.877551           0.54157   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0        0.751091           0.958994            0.584232   

    Average  
0  0.785573  


In [35]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../vectordb-edit/parent_large_report", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:11<00:00,  4.03it/s]


The average score is: 0.8323096204874614
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200      0.857143          0.782169   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                1.0         0.76498           0.955959            0.633607   

   Average  
0  0.83231  


#### Maximum marginal relevance retrieval

In [36]:
retriever_mmr = db_k.as_retriever(search_type="mmr",search_kwargs={"k": 5})
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:10<00:00,  4.41it/s]


The average score is: 0.8393036864489072
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.853571          0.767481                1.0        0.919643   

   Answer Similarity  Answer Correctness   Average  
0           0.938038            0.557089  0.839304  


#### BM25

In [37]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 50)
chunks_1000 = text_splitter.split_documents(documents)

In [38]:
retriever_bm25 = BM25Retriever.from_documents(chunks_1000, search_kwargs = {"k": 5})
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:12<00:00,  3.91it/s]


The average score is: 0.8690658952776448
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25           1.0          0.734177            0.96875        0.933333   

   Answer Similarity  Answer Correctness   Average  
0           0.960589            0.617547  0.869066  


In [39]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


#### Ensambler - Hybrid

In [40]:
retriever_k = db_k.as_retriever(search_kwargs={"k": 5})

In [42]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_k], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  54%|█████▍    | 26/48 [00:04<00:02,  8.77it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 48/48 [00:15<00:00,  3.03it/s]


The average score is: 0.8669132839109005
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1          0.95          0.826975            0.96875   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.982143           0.930705            0.542907  0.866913  


In [43]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_k], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  31%|███▏      | 15/48 [00:03<00:06,  5.44it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.41it/s]


The average score is: 0.8701906592436711
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2      0.926496           0.77387           0.924107   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.96875           0.906679            0.721241  0.870191  


In [44]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_k], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.57it/s]


The average score is: 0.8371376589358781
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3      0.947917          0.615914                1.0   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.919643           0.941465            0.597887  0.837138  


#### Multi-stage - reranker

In [72]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [73]:
retriever_context = retriever_3
compressor = CohereRerank(top_n = 3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression, dict_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:18<00:00,  2.66it/s]


Reranker
{'faithfulness': 0.9009, 'answer_relevancy': 0.8555, 'context_precision': 1.0000, 'context_recall': 0.9443, 'answer_similarity': 0.9532, 'answer_correctness': 0.6482}
The average score is: 0.8836811674501047
This is the new best value!


#### creating context by remaking the query

In [45]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


In [46]:
highest_average = results_df["Average"].max()
highest_average

0.8879916512753878

In [47]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [None]:
retriever_k

In [48]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_k.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.56it/s]


{'faithfulness': 1.0000, 'answer_relevancy': 0.7411, 'context_precision': 1.0000, 'context_recall': 0.9571, 'answer_similarity': 0.9625, 'answer_correctness': 0.6264}

In [49]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.8811896338938401


In [50]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


### change model to GPT-4

In [51]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
26,"Chunk size 1000, overlap 5%, K=5",0.752391,0.909552,0.933543,0.96875,0.941679,0.822034,0.887992
10,"Chunk 1000, overlap 5%",0.959524,0.858199,1.0,0.932143,0.96074,0.606284,0.886148
35,Search query,1.0,0.741125,1.0,0.957143,0.962483,0.626387,0.88119


In [52]:
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [53]:
result_gpt4_1000_5_5, results_df = run_and_evaluate(f"Chunk size 1000, overlap 5%, K=5, GPT-4", retriever_k, prompt, llm_gpt4, results_df)
result_gpt4_1000_5_5

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:14<00:00,  3.32it/s]


The average score is: 0.8486401561377112


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk size 1000, overlap 5%, K=5, GPT-4",0.89983,0.738073,1.0,0.925893,0.948862,0.579183,0.84864


In [54]:
retriever_1000 = db_k.as_retriever()
result_gpt4_1000_5, results_df = run_and_evaluate(f"Chunk size 1000, overlap 5%, GPT-4", retriever_1000, prompt, llm_gpt4, results_df)
result_gpt4_1000_5

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.53it/s]


The average score is: 0.842305289860832


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk size 1000, overlap 5%, GPT-4",0.883681,0.73443,1.0,0.932143,0.948508,0.555071,0.842305


In [55]:
# SEARCH QUERY
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm_gpt4}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm_gpt4, "context": itemgetter("context")}
)
    docs = retriever_k.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query_gpt4 = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
average = dictionary(result_search_query_gpt4)
    # Create a dictionary to store the results
system_results_gpt4 = {
        "System": "Search query GPT-4",
        "Faithfulness": result_search_query_gpt4["faithfulness"],
        "Answer Relevancy": result_search_query_gpt4["answer_relevancy"],
        "Context Precision": result_search_query_gpt4["context_precision"],
        "Context Recall": result_search_query_gpt4["context_recall"],
        "Answer Similarity": result_search_query_gpt4["answer_similarity"],
        "Answer Correctness": result_search_query_gpt4["answer_correctness"],
        "Average": average
    }
df_result_search_query_gpt4 = pd.DataFrame([system_results_gpt4])
results_df = pd.concat([results_df, df_result_search_query_gpt4], ignore_index=True)
results_df

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  56%|█████▋    | 27/48 [00:05<00:03,  6.52it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 48/48 [00:13<00:00,  3.46it/s]


The average score is: 0.8737523984881251


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.909415,0.59349,,,0.910937,0.546557,0.670567
1,GPT-4,0.476318,1.0,,,0.674566,0.853289,0.677246
2,Naive,0.95,0.753455,0.989583,0.975,0.955112,0.623373,0.874421
3,Recursive,0.95,0.75286,0.979167,0.932143,0.955112,0.626853,0.866023
4,"Chunk 500, overlap 0%",0.948148,0.881547,0.979167,0.874188,0.953518,0.61622,0.875465
5,"Chunk 500, overlap 5%",0.937698,0.761564,0.979167,0.905438,0.953697,0.62387,0.860239
6,"Chunk 500, overlap 10%",0.954167,0.751947,0.954861,0.849188,0.960167,0.647432,0.85296
7,"Chunk 500, overlap 15%",1.0,0.758759,0.965278,0.893327,0.958609,0.619789,0.86596
8,"Chunk 500, overlap 20%",0.97619,0.748401,0.975694,0.874577,0.959806,0.606048,0.856786
9,"Chunk 1000, overlap 0%",0.96131,0.731361,0.989583,0.94375,0.952565,0.577283,0.859309


In [56]:
results_df.to_csv(f"../news/results/results_reporting.csv", index=False)