In [41]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
import getpass
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

In [2]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [3]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [4]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [5]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [6]:
questions = [
    "Finish writing the following short report: On February 24, 2024, former President Donald Trump secured a significant victory in the South Carolina Republican primary, defeating Nikki Haley in her home state. This report aims to analyze the key factors leading to Trump's success, the impact on the ongoing presidential race, and potential challenges for other candidates.",
    "Finish writing the following short report: Amidst the mysterious death of Russian opposition leader Alexei Navalny on February 16, 2024, this report investigates the claims made by his colleague, Maria Pevchikh, suggesting that Navalny was on the brink of being freed in a prisoner swap. The alleged plan involved exchanging Navalny for Russian hitman Vadim Krasikov, serving a life sentence in Germany, and two US citizens held in Russia. The report delves into the events leading up to Navalny's demise, the proposed prisoner swap, and the geopolitical implications surrounding this tragic incident.",
    "Please provide a concise report on the Israel-Hamas conflict, focusing on the recent Israeli special forces operation at Nasser Hospital in Khan Younis, Gaza. Include information on the reasons for the operation, international reactions, and the current state of the conflict.",
    "Finish writing the following report: In a recent incident involving an Alaska Airlines Boeing 737 Max 9, a door blowout occurred shortly after take-off, prompting the grounding of 171 planes of the same model in the United States. Boeing's president and CEO, Dave Calhoun, publicly admitted the company's mistake, acknowledging that a missing 27kg door 'plug' was not properly secured.",
    "Finish writing the following short report: Elon Musk's desire for increased control over Tesla has sparked discussions about his stake and concerns regarding potential takeovers.",
    "Finish writing the following short report: Hollywood remains unfazed by OpenAI's Sora, the latest text-to-video generator, despite its ability to quickly produce high-quality videos from simple prompts.",
    "Finish writing the following short report: Prince William's unexpected withdrawal from the memorial service for the late former King Constantine of Greece.",
    "Finish writing the following short report: As Ukraine commemorates the second anniversary of Russia's invasion, Europe confronts pressing questions about sustaining financial support for the war-torn nation."
]

ground_truths = [
    ["Donald Trump won the South Carolina Republican primary on February 24, 2024. The victory solidifies Trump's path towards securing the GOP nomination for the third consecutive time. Nikki Haley, Trump's former representative to the UN and former South Carolina governor, faced challenges despite holding more campaign events. South Carolina's first-in-the-South primary has historically been a reliable indicator of the eventual Republican nominee. Trump has won all significant contests for Republican delegates, including Iowa, New Hampshire, Nevada, and the US Virgin Islands. Haley, despite vowing to continue running, faces pressure to reconsider her candidacy after Trump's South Carolina victory. Trump's strength endures despite facing 91 criminal charges related to efforts to overturn the 2020 election and other controversies. Trump's first criminal trial is scheduled for March 25, 2024, in New York. The 2024 presidential race is increasingly shaping up as a rematch between Trump and President Joe Biden. Biden, facing criticism for supporting Israel in the Gaza War, is preparing for a tough electoral battle against Trump.Trump's age (77) and Biden's age (81) become focal points in the election discourse, with both candidates trading jabs."],
    ["Navalny died on February 16, 2024, in a Siberian prison colony while serving a 19-year sentence. Maria Pevchikh claimed that Navalny was about to be freed in a prisoner swap involving Vadim Krasikov and two US citizens. Kremlin spokesperson Dmitry Peskov denied knowledge of any such agreements or negotiations. Yulia Navalnaya accused Vladimir Putin of killing her husband and demanded the return of Navalny's body. Navalny's body was reportedly held for 'chemical analysis,' leading to suspicions of poisoning with Novichok. International leaders, including the EU and the US, held Putin responsible for Navalny's death and considered potential sanctions. Navalny's opposition activities, his previous poisoning in 2020, and the role of economic policies, particularly mass privatization, in shaping the conditions leading to his tragic death."],
    ["Israeli special forces entered Nasser Hospital in Khan Younis, Gaza, based on 'credible intelligence' that Hamas held captives from the October 7 attacks. The operation aimed to locate bodies of Israeli hostages. Prime Minister Netanyahu vowed to proceed with an invasion of Rafah. Prince William expressed concern over the conflict, emphasizing the need for humanitarian support, hostage release, and permanent peace. The report should cover these key points, incorporating relevant details and developments."],
    ["""Alaska Airlines Boeing 737 Max 9 door blowout incident.
Grounding of 171 Boeing 737 Max 9 planes in the U.S.
Boeing's acknowledgment of the mistake by Dave Calhoun.
Details about the missing 27kg door "plug" and the four crucial bolts.
Rapid loss of cabin pressure and the National Transportation Safety Board's preliminary findings.
Highlight of manufacturing defects and quality control issues at Boeing.
Mention of executive changes at Boeing.
The strain on Boeing's relationship with airlines and increased safety concerns. The missing section of the plane was retrieved from the back garden of a Portland teacher.
Alaska Airlines had placed restrictions on the aircraft following pressurization warnings before the incident.
Boeing's president and CEO, Dave Calhoun, reassured Boeing staff, promising 100% transparency in addressing the mistake.
Boeing organized a "Quality Stand Down" at its 737 factory in Renton, Washington, focusing on quality issues.
The National Safety Transportation Board (NTSB) is expected to release its preliminary findings on the Alaska Airlines 737 Max 9 incident.
Critics argue that Boeing's shift in corporate culture towards prioritizing profits over safety started after the 1997 merger with McDonnell Douglas.
Boeing's move away from its Pacific Northwest roots, the shift in leadership with financial backgrounds, and its reliance on suppliers are highlighted.
The report emphasizes the impact of cost-cutting measures on Boeing's engineering and manufacturing, leading to various safety and quality issues.
Boeing's financial struggles, including cumulative net losses of more than $26 billion over the last five years, are outlined.
A new FAA report finds major issues with Boeing's safety culture, describing it as "inadequate" and "confusing."
The report identifies gaps in Boeing's safety journey, highlighting employee confusion, inconsistent reporting channels, and a fear of retaliation.
Boeing's safety improvements since 2019, such as establishing an aerospace safety committee and a chief aerospace safety office, are mentioned."""],
    ["Elon Musk's desire for increased control over Tesla has sparked discussions about his stake and concerns regarding potential takeovers. Elon Musk, with approximately 13% ownership in Tesla, expressed a desire for increased control due to concerns about potential takeovers, particularly by 'dubious interests.' He highlighted unease regarding Tesla's investments in artificial intelligence and stated a preference for a 25% voting control. The comments led to a 'firestorm' for Tesla, impacting its self-driving and automated capabilities narrative. Additionally, ongoing legal action surrounds a 10-year pay package granted to Musk in 2018, valued at about $55 billion."],
    ["Hollywood remains unfazed by OpenAI's Sora, the latest text-to-video generator, despite its ability to quickly produce high-quality videos from simple prompts. Industry experts express concerns about potential job displacement and creative challenges, with some figures like Tyler Perry already taking action. Sora, though not broadly available yet, has garnered attention for its impressive capabilities, generating hyperrealistic videos up to 1 minute long. Sora is the latest leap in artificial intelligence software, allowing users to create staggeringly realistic videos from a simple worded prompt"],
    ["Prince William's unexpected withdrawal from the memorial service for the late former King Constantine of Greece was attrubyted to a 'personal matter'.  In the absence of Prince William, Queen Camilla led the Royal Family at St George's Chapel in Windsor Castle. Despite concerns, royal sources assured the public that there was no cause for alarm. "],
    ["As Ukraine commemorates the second anniversary of Russia's invasion, Europe confronts pressing questions about sustaining financial support for the war-torn nation. The war, deadlocked for some time, witnessed Ukraine's recent withdrawal from the key town of Avdiivka, marking its worst defeat since May. Financial aid from the United States, currently awaiting House approval, is crucial, with the EU and NATO unity showing signs of strain. Despite spending over $100 billion on Ukraine's defense, Europe is grappling with the challenge of political fatigue and the potential impact of upcoming elections. "]
    ]

In [None]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    return average_score

### General answer by LLMs

In [7]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [8]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [9]:
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [10]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [11]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)
dict_llm_results = dictionary(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:10<00:00,  3.89it/s]


{'faithfulness': 0.9630, 'answer_relevancy': 0.4663, 'answer_similarity': 0.9008, 'answer_correctness': 0.4650}


In [13]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)
dict_llm_results_gpt4 = dictionary(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 40/40 [00:14<00:00,  2.71it/s]


{'faithfulness': 0.9167, 'answer_relevancy': 0.2835, 'answer_similarity': 0.9067, 'answer_correctness': 0.5423}


# RAG

### Naive RAG

In [13]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [25]:
loader = DirectoryLoader('../news', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../news/vectordb/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [19]:
db_naive = Chroma(persist_directory = "../news/vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [7]:
template = """{question}. 
Provided context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [21]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [22]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.94it/s]


{'faithfulness': 0.9187, 'answer_relevancy': 0.8303, 'context_precision': 0.9667, 'context_recall': 0.9600, 'answer_similarity': 0.9630, 'answer_correctness': 0.7468}


### recursive splitter

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../news/vectordb/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [23]:
db_basic = Chroma(persist_directory = "../news/vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [24]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [25]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.76it/s]


{'faithfulness': 0.9524, 'answer_relevancy': 0.8284, 'context_precision': 0.9750, 'context_recall': 0.9800, 'answer_similarity': 0.9634, 'answer_correctness': 0.7330}


### chunk sizes

In [11]:
def run_and_evaluate(retriever, prompt, llm):
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

chunk = 1000

In [None]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../news/vectordb/recursive_1000")
db_1000.persist()

In [27]:
db_1000 = Chroma(persist_directory = "../news/vectordb/recursive_1000", embedding_function=embeddings_client)

In [28]:
retriever_1000 = db_1000.as_retriever()
result_1000 = run_and_evaluate(retriever_1000, prompt, llm)
print("CHUNK SIZE 1000")
print(result_1000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.18it/s]


CHUNK SIZE 1000
{'faithfulness': 0.9841, 'answer_relevancy': 0.8044, 'context_precision': 0.9750, 'context_recall': 0.9550, 'answer_similarity': 0.9614, 'answer_correctness': 0.6946}


chunk = 500

In [33]:
# THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../news/vectordb/recursive_500")
db_500.persist()

107


In [None]:
db_500 = Chroma(persist_directory = "../news/vectordb/recursive_500", embedding_function=embeddings_client)

In [34]:
retriever_500 = db_500.as_retriever()
result_500 = run_and_evaluate(retriever_500, prompt, llm)
print("CHUNK SIZE 500")
print(result_500)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.89it/s]


CHUNK SIZE 500
{'faithfulness': 0.9378, 'answer_relevancy': 0.7268, 'context_precision': 0.9500, 'context_recall': 0.9472, 'answer_similarity': 0.9616, 'answer_correctness': 0.7098}


chunk = 2000

In [35]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../news/vectordb/recursive_2000")


37


In [8]:
db_2000 = Chroma(persist_directory = "../news/vectordb/recursive_2000", embedding_function=embeddings_client)

In [37]:
retriever_2000 = db_2000.as_retriever()
result_2000 = run_and_evaluate(retriever_2000, prompt, llm)
print("CHUNK SIZE 2000")
print(result_2000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.28it/s]


CHUNK SIZE 2000
{'faithfulness': 0.9435, 'answer_relevancy': 0.8731, 'context_precision': 0.9750, 'context_recall': 1.0000, 'answer_similarity': 0.9636, 'answer_correctness': 0.7390}


chunk = 3000

In [38]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../news/vectordb/recursive_3000")

32


In [None]:
db_3000 = Chroma(persist_directory = "../news/vectordb/recursive_3000", embedding_function=embeddings_client)

In [39]:
retriever_3000 = db_3000.as_retriever()
result_3000 = run_and_evaluate(retriever_3000, prompt, llm)
print("CHUNK SIZE 3000")
print(result_3000)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.30it/s]


CHUNK SIZE 3000
{'faithfulness': 0.9592, 'answer_relevancy': 0.7520, 'context_precision': 0.9583, 'context_recall': 0.9800, 'answer_similarity': 0.9645, 'answer_correctness': 0.7113}


### now time to look for different top-k

Note: We continue with the size chunk of 2000 as it had the highest average score

In [9]:
retriever_2000 = db_2000.as_retriever()

In [14]:
retriever_2 = db_2000.as_retriever(search_kwargs={"k": 2})
result_2 = run_and_evaluate(retriever_2, prompt, llm)
print("CHUNK SIZE 1000, K=2")
print(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.54it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 0.9306, 'answer_relevancy': 0.9134, 'context_precision': 1.0000, 'context_recall': 0.9289, 'answer_similarity': 0.9617, 'answer_correctness': 0.7467}


In [15]:
retriever_3 = db_2000.as_retriever(search_kwargs={"k": 3})
result_3 = run_and_evaluate(retriever_3, prompt, llm)
print("CHUNK SIZE 1000, K=3")
print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.10it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 0.9260, 'answer_relevancy': 0.8950, 'context_precision': 0.9833, 'context_recall': 0.9750, 'answer_similarity': 0.9641, 'answer_correctness': 0.7973}


In [16]:
retriever_5 = db_2000.as_retriever(search_kwargs={"k": 5})
result_5 = run_and_evaluate(retriever_5, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.76it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.9688, 'answer_relevancy': 0.7442, 'context_precision': 0.9617, 'context_recall': 0.9714, 'answer_similarity': 0.9619, 'answer_correctness': 0.7390}


In [17]:
retriever_6= db_2000.as_retriever(search_kwargs={"k": 6})
result_6 = run_and_evaluate(retriever_6, prompt, llm)
print("CHUNK SIZE 1000, K=5")
print(result_6)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  63%|██████▎   | 38/60 [00:06<00:02,  9.86it/s]Invalid JSON response. Expected dictionary with key 'Attributed'
Evaluating:  97%|█████████▋| 58/60 [00:13<00:01,  1.38it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    retu

CHUNK SIZE 1000, K=5
{'faithfulness': 0.7835, 'answer_relevancy': 0.8474, 'context_precision': 0.8161, 'context_recall': 0.9210, 'answer_similarity': 0.9402, 'answer_correctness': 0.8747}


In [23]:
dict_res6 = dict(result_6)
average_score = sum(dict_res6.values()) / len(dict_res6)
print(f"The average score is: {average_score}")

The average score is: 0.8638028044818998


### look for different retrievers

3 chunks was the best score

#### parent document retriever

In [26]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../news/vectordb/parent", embedding_function=embeddings_client)
vectorstore.persist()
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent = dict(result_parent)
average_score_parent = sum(dict_result_parent.values()) / len(dict_result_parent)
print(f"The average score is: {average_score_parent}")

TypeError: run_and_evaluate() missing 2 required positional arguments: 'prompt' and 'llm'

In [27]:
result_parent = run_and_evaluate(parent_document_retriever, prompt, llm)
print(result_parent)
dict_result_parent = dict(result_parent)
average_score_parent = sum(dict_result_parent.values()) / len(dict_result_parent)
print(f"The average score is: {average_score_parent}")

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.99it/s]


{'faithfulness': 0.9738, 'answer_relevancy': 0.9057, 'context_precision': 0.9833, 'context_recall': 0.9489, 'answer_similarity': 0.9621, 'answer_correctness': 0.7362}
The average score is: 0.9183376088705733


In [28]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../news/vectordb/parent_small", embedding_function=embeddings_client)
vectorstore.persist()
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small = run_and_evaluate(parent_document_retriever_small, prompt, llm)
print(result_parent_small)
dict_result_parent_small = dict(result_parent_small)
average_score_parent_small = sum(dict_result_parent_small.values()) / len(dict_result_parent_small)
print(f"The average score is: {average_score_parent_small}")

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.75it/s]


{'faithfulness': 0.9844, 'answer_relevancy': 0.7338, 'context_precision': 0.9417, 'context_recall': 0.9378, 'answer_similarity': 0.9615, 'answer_correctness': 0.6669}
The average score is: 0.871012803992867


In [29]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=3000)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=300, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../news/vectordb/parent_large", embedding_function=embeddings_client)
vectorstore.persist()
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large = run_and_evaluate(parent_document_retriever_large, prompt, llm)
print(result_parent_large)
dict_result_parent_large = dict(result_parent_large)
average_score_parent_large = sum(dict_result_parent_large.values()) / len(dict_result_parent_large)
print(f"The average score is: {average_score_parent_large}")

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.10it/s]


{'faithfulness': 0.9738, 'answer_relevancy': 0.9087, 'context_precision': 0.9833, 'context_recall': 0.9639, 'answer_similarity': 0.9642, 'answer_correctness': 0.6708}
The average score is: 0.9107785434064702


#### Maximum marginal relevance retrieval

In [32]:
retriever_mmr = db_2000.as_retriever(search_type="mmr",search_kwargs={"k": 3})
result_mmr = run_and_evaluate(retriever_mmr, prompt, llm)
print("Marginal relevance")
print(result_mmr)
average_score_mmr = dictionary(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.59it/s]


Marginal relevance
{'faithfulness': 0.9643, 'answer_relevancy': 0.7875, 'context_precision': 1.0000, 'context_recall': 0.9078, 'answer_similarity': 0.9574, 'answer_correctness': 0.6069}
The average score is: 0.8706369232635726


#### BM25

In [34]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)

In [35]:
retriever_bm25 = BM25Retriever.from_documents(chunks_2000)
result_bm25 = run_and_evaluate(retriever_bm25, prompt, llm)
print("BM25")
print(result_bm25)
average_score_bm25 = dictionary(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-625' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packa

BM25
{'faithfulness': 0.9708, 'answer_relevancy': 0.7770, 'context_precision': 0.9333, 'context_recall': 0.9750, 'answer_similarity': 0.9483, 'answer_correctness': 0.6228}
The average score is: 0.8712076560157209


#### Ensambler - Hybrid

In [38]:
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.75, 0.25])
result_ensemble1 = run_and_evaluate(ensemble_retriever_1, prompt, llm)
print("Ensambler 75/25")
print(result_ensemble1)
average_score_ensambler1 = dictionary(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:18<00:00,  3.23it/s]


Ensambler
{'faithfulness': 0.9607, 'answer_relevancy': 0.8681, 'context_precision': 0.9617, 'context_recall': 0.9500, 'answer_similarity': 0.9604, 'answer_correctness': 0.6588}
The average score is: 0.8932842741994689


In [39]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
result_ensemble2 = run_and_evaluate(ensemble_retriever_2, prompt, llm)
print("Ensambler 50/50")
print(result_ensemble2)
avg_result_ensemble2 = dictionary(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.84it/s]


Ensambler 50/50
{'faithfulness': 0.9841, 'answer_relevancy': 0.8430, 'context_precision': 0.9667, 'context_recall': 0.9500, 'answer_similarity': 0.9532, 'answer_correctness': 0.7541}
The average score is: 0.9085234951728957


In [40]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.25,0.75])
result_ensemble3 = run_and_evaluate(ensemble_retriever_3, prompt, llm)
print("Ensambler 25/75")
print(result_ensemble3)
avg_result_ensemble3 = dictionary(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  28%|██▊       | 17/60 [00:05<00:10,  4.13it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating:  38%|███▊      | 23/60 [00:06<00:09,  3.78it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.90it/s]


Ensambler 25/75
{'faithfulness': 1.0000, 'answer_relevancy': 0.7409, 'context_precision': 0.9583, 'context_recall': 0.9214, 'answer_similarity': 0.9612, 'answer_correctness': 0.7724}
The average score is: 0.8923824405244444


#### Multi-stage - reranker

In [42]:
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [45]:
retriever_context = retriever_2000
compressor = CohereRerank(top_n = 3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever_context
)

result_compression = run_and_evaluate(compression_retriever, prompt, llm)
print("Reranker")
print(result_compression)
avg_result_compression = dictionary(result_compression)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.99it/s]


Reranker
{'faithfulness': 0.8968, 'answer_relevancy': 0.8880, 'context_precision': 0.9417, 'context_recall': 0.9750, 'answer_similarity': 0.9576, 'answer_correctness': 0.6623}
The average score is: 0.8868849276357208


#### creating context by remaking the query

In [46]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [49]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = retriever_3.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
print(result_search_query)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.95it/s]


{'faithfulness': 0.9815, 'answer_relevancy': 0.8995, 'context_precision': 0.9833, 'context_recall': 0.9750, 'answer_similarity': 0.9605, 'answer_correctness': 0.7531}


In [50]:
print(result_search_query)
avg_result_search_query = dictionary(result_search_query)

{'faithfulness': 0.9815, 'answer_relevancy': 0.8995, 'context_precision': 0.9833, 'context_recall': 0.9750, 'answer_similarity': 0.9605, 'answer_correctness': 0.7531}
The average score is: 0.9254703293008256


### change model to GPT-4

In [53]:
answers_final_gpt4 = []
contexts_final_gpt4 = []
llm_for_context_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm_gpt4}
)
for query in questions:
    response_check_gpt4 = llm_for_context_gpt4.invoke({"question": query})
    search_query_gpt4 = response_check_gpt4["response"].content
    retrieval_augmented_qa_chain_gpt4 = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm_gpt4, "context": itemgetter("context")}
)
    docs_gpt4 = retriever_3.get_relevant_documents(search_query_gpt4)
    formatted_docs = []
    for doc in docs_gpt4:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain_gpt4.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final_gpt4.append(response["response"].content)
            contexts_final_gpt4.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final_gpt4.append("No answer")
            contexts_final_gpt4.append(formatted_docs)


result_search_query_gpt4 = evaluation_rag(questions, answers_final_gpt4, contexts_final_gpt4, ground_truths)
print(result_search_query_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:15<00:00,  3.95it/s]


{'faithfulness': 0.9907, 'answer_relevancy': 0.8982, 'context_precision': 0.9833, 'context_recall': 0.9750, 'answer_similarity': 0.9634, 'answer_correctness': 0.7417}


In [54]:
avg_result_search_query_gpt4 = dictionary(result_search_query_gpt4)

The average score is: 0.9254145421692223
