In [1]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain ragas datasets cohere

In [21]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import pandas as pd
import numpy as np
import sys
sys.tracebacklimit = 0

In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

# Evaluation part

### Ground truth

In [68]:
questions = [
    "Who was Alexei Navalny supposed to be exchanged for in the planned prisoner swap, according to his colleague Maria Pevchikh?",
    "Why did the door blow out of Boeing 737 Max shortly after take-off?",
    "When did it mark 2 years since Russia's envasion in Ukraine?",
    "What is Sora AI, how does it work, and when can the general public expect to use it?",
    "What are the potential key issues that could impact the 2024 U.S. presidential election between Joe Biden and Donald Trump, and how might they impact the outcome?",
    "Why does Elon Musk want a bigger stake in Tesla, and what concerns does he express about the current structure of the company?",
    "What are the shopping habits of Gen Alpha, and why are mature brands interested in targeting this demographic?",
    "What happened during the Hamas attacks on Israel in 2023?",
    "On January 15, 2024, how did Kim Jong Un call South Korea?",
    "Why did Prince William pull out of the godfather's memorial service on 27.02.2024?"
]

ground_truths = [
    ["Alexei Navalny was supposed to be exchanged for Vadim Krasikov, a Russian hitman serving a life sentence for murder in Germany, as claimed by Maria Pevchikh."],  
    ["Bolts were missing."],
    ["On February 24, 2024."],
    ["Sora AI is an artificial intelligence tool developed by OpenAI that can generate realistic videos up to 1 minute long based on textual prompts. It utilizes diffusion models, a method used in AI image generators, to create videos by understanding the association between images and accompanying alt text. As of now, Sora is not available to the general public; OpenAI is following a cautious approach by first testing it with a small group of 'red teamers' for potential harm or risks, followed by availability to visual artists, designers, and filmmakers before a potential public release, likely under a pay-to-use model similar to GPT. While Sora has shown significant advancements in AI video generation compared to previous attempts, it is not perfect, with occasional flaws and limitations seen in hand-picked videos released by OpenAI."],
    ["The potential key issues that could impact the 2024 U.S. presidential election between Joe Biden and Donald Trump include contrasting economic views, with Americans historically voting based on their economic well-being; the contentious topics of abortion and immigration, with Democrats focusing on abortion rights and Republicans emphasizing border security; and external factors like the Gaza War and its impact on Biden's support base. Other factors include crime rates, environmental concerns, and foreign policy. Additionally, the health and age of both candidates could be scrutinized, and the potential emergence of third-party candidates or independents might introduce unpredictability. Trump's legal challenges, facing 91 charges and four criminal trials, could influence public opinion, and the specter of the January 2021 Capitol attack may resonate differently among Republican and Democratic voters. Overall, these factors contribute to the complexity and uncertainty of the upcoming presidential race."],
    ["Elon Musk wants a bigger stake in Tesla to gain more control over the company's direction, especially regarding its investments in artificial intelligence (AI) features. He is concerned about Tesla's vulnerability to a 'takeover by dubious interests' and aims to have 25% voting control to ensure the company's leadership in AI and robotics. Musk suggests that without this control, he would prefer to develop products outside of Tesla."],
    ["Gen Alpha, born between 2010 and 2024, prefers to shop at adult brands like Lululemon, Sephora, Walmart, and Target, following the trends of their millennial parents. Adult brands are interested in targeting Gen Alpha due to their economic potential, with an estimated economic footprint of $5.46 trillion by 2029. These brands can secure the loyalty of the next generation by expanding their offerings to cater to the specific needs of Gen Alpha."],
    ["On the morning of 7 October, waves of Hamas gunmen stormed across Gaza's border into Israel, killing about 1,200 people. Hamas also fired thousands of rockets. Those killed included children, the elderly and 364 young people at a music festival.  Hamas took more than 250 others to Gaza as hostages."],
    ["Kim Jong Un  'principal enemy.' He intensified nuclear threats, conducted missile tests, and abolished government agencies promoting cooperation and reunification. Analysts, including Robert L. Carlin and Siegfried Hecker, suggest he may be preparing for a military attack on South Korea."],
    ["Due to 'personal matter'."]
]
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [5]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [6]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [7]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [8]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [9]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [10]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [11]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [12]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answers by LLM

In [14]:
template = """{question}"""
prompt = ChatPromptTemplate.from_template(template)

In [15]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [16]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [17]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.88it/s]


The average score is: 0.6253870004200923
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5           1.0          0.375595                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.847668             0.45406  0.625387  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [21]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  7.97it/s]


The average score is: 0.5506575325401026
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4          0.75          0.360955                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.835999            0.415325  0.550658  


In [22]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658


### Naive RAG

In [13]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [14]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [15]:
loader = DirectoryLoader('../news', glob="./*.txt", loader_cls=TextLoader)
documents = loader.load()

In [None]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../vectordb/naive")
retriever_naive = db_naive.as_retriever()

In [26]:
db_naive = Chroma(persist_directory = "../vectordb/naive", embedding_function=embeddings_client)
retriever_naive = db_naive.as_retriever()

In [27]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [28]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  23%|██▎       | 14/60 [00:05<00:16,  2.85it/s]Task exception was never retrieved
future: <Task finished name='Task-186' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py", line 2018, in aclose
    await self._transport.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_transports\default.py", line 385, in aclose
    await self._pool.aclose()
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\s

The average score is: 0.8433780682264684
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.904762          0.773143                0.9            0.98   

   Answer Similarity  Answer Correctness   Average  
0           0.902388            0.599976  0.843378  


In [17]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548


## try recursive text splitter

In [19]:
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../vectordb/recursive_basic")
retriever_basic = db_basic.as_retriever()

In [29]:
db_basic = Chroma(persist_directory = "../vectordb/recursive_basic", embedding_function=embeddings_client)
retriever_basic = db_basic.as_retriever()

In [30]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)



In [31]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.18it/s]


The average score is: 0.8525479235764283
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548


## chunk size change

In [23]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [35]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 0)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../news/vectordb-edit/recursive_1000")
# db_1000.persist()

55


In [None]:
# db_1000 = Chroma(persist_directory = "../news/vectordb-edit/recursive_1000", embedding_function=embeddings_client)

In [39]:
# retriever_1000 = db_1000.as_retriever()
# result_1000_0 = run_and_evaluate("Chunk 1000, overlap 0%",retriever_1000, prompt, llm, results_df)
# print("CHUNK SIZE 1000, 0% overlap")
# result_1000_0

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.55it/s]


The average score is: 0.7505457735075017
This is the new best value!
CHUNK SIZE 1000, 0% overlap


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%",0.716667,0.599047,0.9,0.98,0.847668,0.459893,0.750546


In [41]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548


In [36]:
results_df.to_csv(f"../news/results/results.csv", index=False)

In [27]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # Split documents
        chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%: {len(chunks)}")
        
        # Create Chroma database
        db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../news/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 0%: 102


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.90it/s]


The average score is: 0.8372452594212253
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%           1.0          0.837908                0.8   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9            0.92037            0.565194  0.837245  
Number of chunks for chunk size 500, overlap 5%: 102


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.83it/s]


The average score is: 0.8221936357202062
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%         0.875          0.741242           0.847222   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.925           0.919578            0.625119  0.822194  
Number of chunks for chunk size 500, overlap 10%: 107


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-512' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-513' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-514' coro=<AsyncClient.aclose() done, defined

The average score is: 0.829080551572552
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%         0.875          0.744905               0.85   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.95           0.917581            0.636998  0.829081  
Number of chunks for chunk size 500, overlap 15%: 113


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:13<00:00,  4.44it/s]


The average score is: 0.8097132090775329
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.888889          0.658863                0.9   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.9           0.912248             0.59828  0.809713  
Number of chunks for chunk size 500, overlap 20%: 115


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.73it/s]


The average score is: 0.8507321083714663
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%      0.907143           0.81964           0.891667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0          0.9375            0.91602            0.632423  0.850732  
Number of chunks for chunk size 1000, overlap 0%: 55


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.64it/s]


The average score is: 0.8645487695803418
This is the new best value!
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%      0.944444          0.797138              0.925   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.917733            0.622977  0.864549  
Number of chunks for chunk size 1000, overlap 5%: 55


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.93it/s]


The average score is: 0.849747933599466
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%      0.982143          0.702075              0.925   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.918439             0.59083  0.849748  
Number of chunks for chunk size 1000, overlap 10%: 56


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.01it/s]


The average score is: 0.878546090006025
This is the new best value!
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%           1.0          0.781964           0.933333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98            0.92265            0.653329  0.878546  
Number of chunks for chunk size 1000, overlap 15%: 58


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.21it/s]


The average score is: 0.8503212940324234
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%      0.982143          0.697939                0.9   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.920154            0.621692  0.850321  
Number of chunks for chunk size 1000, overlap 20%: 58


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.44it/s]


The average score is: 0.8804388538467025
This is the new best value!
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%           1.0          0.889265           0.883333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98            0.92067            0.609364  0.880439  
Number of chunks for chunk size 2000, overlap 0%: 37


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  27%|██▋       | 16/60 [00:04<00:14,  2.96it/s]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.59it/s]


The average score is: 0.8282927393085529
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.758913          0.825006           0.917361   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.825           0.862384            0.781092  0.828293  
Number of chunks for chunk size 2000, overlap 5%: 37


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.63it/s]


The average score is: 0.8156004343654494
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%           1.0          0.732886               0.95   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0           0.775           0.887796             0.54792   0.8156  
Number of chunks for chunk size 2000, overlap 10%: 37


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.72it/s]


The average score is: 0.8418491446511309
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%           1.0          0.832789               0.95   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.78           0.910568            0.577737  0.841849  
Number of chunks for chunk size 2000, overlap 15%: 37


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.42it/s]


The average score is: 0.8346051315931802
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.958333          0.839517               0.95   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0           0.775           0.910192            0.574588  0.834605  
Number of chunks for chunk size 2000, overlap 20%: 37


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  82%|████████▏ | 49/60 [00:06<00:00, 13.88it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.77it/s]


The average score is: 0.8195863903760162
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%           1.0          0.734692           0.933333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.78           0.894042            0.575451  0.819586  
Number of chunks for chunk size 3000, overlap 0%: 32


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:02<02:14,  2.28s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8258 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  97%|█████████▋| 58/60 [00:21<00:00,  5.14it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8445 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_

The average score is: 0.8325463613991283
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.732732          0.922064           0.708958   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.970775            0.88303            0.777719  0.832546  
Number of chunks for chunk size 3000, overlap 5%: 32


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  18%|█▊        | 11/60 [00:04<00:16,  2.89it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8258 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  33%|███▎      | 20/60 [00:04<00:07,  5.27it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8239 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context

The average score is: 0.8015986348093008
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.908051          0.715152             0.9248   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.617445           0.852376            0.791767  0.801599  
Number of chunks for chunk size 3000, overlap 10%: 32


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   2%|▏         | 1/60 [00:02<02:09,  2.19s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8258 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  95%|█████████▌| 57/60 [00:19<00:00,  5.55it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8572 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_

The average score is: 0.7963583688335518
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.732732          0.920121           0.690971   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.783333           0.875408            0.775585  0.796358  
Number of chunks for chunk size 3000, overlap 15%: 32


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   5%|▌         | 3/60 [00:02<00:36,  1.56it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8258 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  97%|█████████▋| 58/60 [00:21<00:00,  3.69it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 8572 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_

The average score is: 0.7998798801496001
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%      0.729689          0.920638           0.653896   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.837898           0.882022            0.775137  0.79988  
Number of chunks for chunk size 3000, overlap 20%: 32


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9248 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}
Evaluating:  20%|██        | 12/60 [00:04<00:14,  3.21it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 8192 tokens. However, your messages resulted in 9389 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_e

The average score is: 0.8231238890592086
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%      0.863351          0.770708           0.865681   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.915524           0.677045            0.846434  0.823124  


In [None]:
# # THE FIRST TIME RUN THIS, AFTER RUN THE NEXT CELL
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 50)
# chunks_1000_5 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000_5 = Chroma.from_documents(chunks_1000_5, embeddings_client, persist_directory = "../news/vectordb-edit/recursive_1000_5")
# db_1000_5.persist()

In [None]:
# retriever_1000_5 = db_1000_5.as_retriever()
# result_1000_5 = run_and_evaluate("Chunk 1000, overlap 0%",retriever_1000_5, prompt, llm, results_df)
# print("CHUNK SIZE 1000, 5% overlap")
# result_1000_5

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../vectordb/recursive_500")
# retriever_500 = db_500.as_retriever()
# result_500 = change_chunk_size(retriever_500)
# print("CHUNK SIZE 500")
# print(result_500)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../vectordb/recursive_2000")
# retriever_2000 = db_2000.as_retriever()
# result_2000 = change_chunk_size(retriever_2000)
# print("CHUNK SIZE 2000")
# print(result_2000)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../vectordb/recursive_3000")
# retriever_3000 = db_3000.as_retriever()
# result_3000 = change_chunk_size(retriever_3000)
# print("CHUNK SIZE 3000")
# print(result_3000)

In [28]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


In [29]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.8804388538467025


In [30]:
results_df.to_csv(f"../news/results/results.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 1000 as it had the highest average score

In [32]:
db_k = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_20", embedding_function=embeddings_client)

In [33]:
k_values = [2, 3, 5]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size {chunk_size}, overlap {overlap_percentage}%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.20it/s]


The average score is: 0.8366701197718719
Results for K=2:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=2      0.958333          0.731547   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0                0.9             1.0           0.906164            0.523977   

   Average  
0  0.83667  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-2058' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-2059' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-2060' coro=<AsyncClient.aclose() done, defi

The average score is: 0.8221172613620668
Results for K=3:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=3           1.0          0.640095   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.883333           0.925           0.916328            0.567948   

    Average  
0  0.822117  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.88it/s]


The average score is: 0.844671933282164
Results for K=5:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=5      0.968254          0.676828   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.903333             1.0            0.91622            0.603396   

    Average  
0  0.844672  


In [34]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


In [35]:
results_df.to_csv(f"../news/results/results.csv", index=False)

In [28]:
# retriever_2 = db.as_retriever(search_kwargs={"k": 2})
# result_2 = run_and_evaluate("K=2",retriever_2, prompt, llm, results_df)
# print("CHUNK SIZE 1000, K=2")
# print(result_2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:08<00:00,  6.88it/s]


CHUNK SIZE 1000, K=2
{'faithfulness': 1.0000, 'answer_relevancy': 0.7317, 'context_precision': 0.9000, 'context_recall': 1.0000, 'answer_similarity': 0.9071, 'answer_correctness': 0.5908}


In [29]:
# retriever_3 = db_1000.as_retriever(search_kwargs={"k": 3})
# result_3 = change_chunk_size(retriever_3)
# print("CHUNK SIZE 1000, K=3")
# print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.33it/s]


CHUNK SIZE 1000, K=3
{'faithfulness': 1.0000, 'answer_relevancy': 0.7832, 'context_precision': 0.9333, 'context_recall': 0.9250, 'answer_similarity': 0.9326, 'answer_correctness': 0.6117}


In [30]:
# retriever_5 = db_1000.as_retriever(search_kwargs={"k": 5})
# result_5 = change_chunk_size(retriever_5)
# print("CHUNK SIZE 1000, K=5")
# print(result_5)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:14<00:00,  4.03it/s]


CHUNK SIZE 1000, K=5
{'faithfulness': 0.9028, 'answer_relevancy': 0.6788, 'context_precision': 0.9333, 'context_recall': 1.0000, 'answer_similarity': 0.9250, 'answer_correctness': 0.6024}


### look for different retrievers

4 chunks was the best score

#### parent document retriever

In [36]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../vectordb-edit/parent", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.20it/s]


The average score is: 0.8288468817383213
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200        0.9375          0.738172   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0               0.85             1.0            0.89614            0.551269   

    Average  
0  0.828847  


In [37]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../vectordb-edit/parent_small", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  35%|███▌      | 21/60 [00:05<00:06,  5.58it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating: 100%|██████████| 60/60 [00:11<00:00,  5.01it/s]


The average score is: 0.860574218506804
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.705718              0.75   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.952744        0.983333           0.962346            0.809304   

    Average  
0  0.860574  


In [38]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../vectordb-edit/parent_large", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  35%|███▌      | 21/60 [00:05<00:06,  5.86it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.46it/s]


The average score is: 0.8531451228062102
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200      0.690791          0.718519   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.955095        0.983333           0.961934              0.8092   

    Average  
0  0.853145  


In [39]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


#### Maximum marginal relevance retrieval

In [40]:
retriever_mmr = db_k.as_retriever(search_type="mmr")
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.65it/s]


The average score is: 0.8353660265249804
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR           1.0          0.828457                0.9        0.719048   

   Answer Similarity  Answer Correctness   Average  
0           0.917804            0.646888  0.835366  


#### BM25

In [41]:
from langchain.retrievers import BM25Retriever
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 200)
chunks_1000_200 = text_splitter.split_documents(documents)
retriever_bm25 = BM25Retriever.from_documents(chunks_1000_200)
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.34it/s]


The average score is: 0.8412957004801213
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25        0.9375          0.790087                0.9        0.922857   

   Answer Similarity  Answer Correctness   Average  
0           0.914668            0.582662  0.841296  


#### Ensambler - Hybrid

In [42]:
from langchain.retrievers import EnsembleRetriever
ret = db_k.as_retriever()
ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  38%|███▊      | 23/60 [00:05<00:06,  5.60it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.27it/s]


The average score is: 0.8344842070383661
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1           0.9          0.857714           0.838333   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.893071            0.537787  0.834484  


In [43]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  37%|███▋      | 22/60 [00:05<00:07,  5.13it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.75it/s]


The average score is: 0.8636336690201237
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2           1.0          0.748496           0.916667   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.921807            0.614832  0.863634  


In [44]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  33%|███▎      | 20/60 [00:05<00:09,  4.27it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 60/60 [00:10<00:00,  5.70it/s]


The average score is: 0.8241221463818186
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3          0.93          0.653657               0.92   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0            0.98           0.917549            0.543527  0.824122  


In [45]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


#### Multi-stage - reranker

In [46]:
%pip install --upgrade --quiet  cohere

Note: you may need to restart the kernel to use updated packages.


In [47]:
import getpass

os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [67]:
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

retriever_context = ret
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, 
    base_retriever=retriever_context
)

result_compression, results_df = run_and_evaluate(f"Cohere reranker", compression_retriever, prompt, llm, results_df)
print("Reranker")
print(result_compression)



TypeError: BaseCohere.rerank() takes 1 positional argument but 4 positional arguments (and 2 keyword-only arguments) were given

#### creating context by remaking the query

In [51]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [57]:
answers_final = []
contexts_final = []
# retriever_context_q = EnsembleRetriever(retrievers=[retriever_bm25, retriever_3], weights=[0.5, 0.5])
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = ret.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)


result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.57it/s]


{'faithfulness': 0.9524, 'answer_relevancy': 0.6399, 'context_precision': 0.8833, 'context_recall': 0.9800, 'answer_similarity': 0.9106, 'answer_correctness': 0.5443}

In [59]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])

The average score is: 0.8184278479648288


In [60]:
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


### change model to GPT-4

In [53]:
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [54]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,1.0,0.375595,,,0.847668,0.45406,0.625387
1,GPT-4,0.75,0.360955,,,0.835999,0.415325,0.550658
2,Naive,0.904762,0.773143,0.9,0.98,0.902388,0.599976,0.843378
3,Recursive,0.916667,0.772477,0.9,0.98,0.902388,0.643756,0.852548
4,"Chunk 500, overlap 0%",1.0,0.837908,0.8,0.9,0.92037,0.565194,0.837245
5,"Chunk 500, overlap 5%",0.875,0.741242,0.847222,0.925,0.919578,0.625119,0.822194
6,"Chunk 500, overlap 10%",0.875,0.744905,0.85,0.95,0.917581,0.636998,0.829081
7,"Chunk 500, overlap 15%",0.888889,0.658863,0.9,0.9,0.912248,0.59828,0.809713
8,"Chunk 500, overlap 20%",0.907143,0.81964,0.891667,0.9375,0.91602,0.632423,0.850732
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


In [55]:
result_gpt4_1000_20, results_df = run_and_evaluate(f"Chunk 1000, overlap 20%, GPT-4", ret, prompt, llm_gpt4, results_df)
result_gpt4_1000_20

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.90it/s]


The average score is: 0.8626577082002025


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 20%, GPT-4",1.0,0.711227,0.883333,0.98,0.925291,0.676095,0.862658


In [62]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
13,"Chunk 1000, overlap 20%",1.0,0.889265,0.883333,0.98,0.92067,0.609364,0.880439
11,"Chunk 1000, overlap 10%",1.0,0.781964,0.933333,0.98,0.92265,0.653329,0.878546
9,"Chunk 1000, overlap 0%",0.944444,0.797138,0.925,0.98,0.917733,0.622977,0.864549


In [63]:
db_0 = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_0", embedding_function=embeddings_client)
ret_0 = db_0.as_retriever()
result_search_query_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 0%, GPT-4", ret_0, prompt, llm_gpt4, results_df)
result_search_query_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:09<00:00,  6.27it/s]


The average score is: 0.8810247743091507
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 0%, GPT-4",1.0,0.80883,0.925,0.98,0.92286,0.649458,0.881025


In [64]:
db_10 = Chroma(persist_directory = "../news/vectordb-edit/chunking_1000_10", embedding_function=embeddings_client)
ret_10 = db_10.as_retriever()
result_search_query_gpt4, results_df = run_and_evaluate(f"Chunk 1000, overlap 10%, GPT-4", ret_10, prompt, llm_gpt4, results_df)
result_search_query_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  38%|███▊      | 23/60 [00:04<00:05,  6.52it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating: 100%|██████████| 60/60 [00:12<00:00,  4.64it/s]


The average score is: 0.8557522471755882


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Chunk 1000, overlap 10%, GPT-4",0.688379,0.805449,0.832043,1.0,0.955418,0.853223,0.855752


In [65]:
results_df.to_csv(f"../news/results/results_qa_news.csv", index=False)