In [1]:
!pip install pypdf

In [2]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
import pandas as pd
import numpy as np
import sys
sys.tracebacklimit = 0
from langchain_community.document_loaders import PyPDFLoader


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [4]:
questions = [
    "Who wrote 'The Hanging Tree' song?",
    "Which District was the source of the Snow family's wealth before the war?",
    "Which district did the Plinth family call home before moving to the Capitol?",
    "What is Lucy Gray wearing when she first appears in 'The Ballad of Songbirds and Snakes'?",
    "Where are the tributes sent immediately after they arrive in the Capitol in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do that causes Sejanus's execution in 'The Ballad of Songbirds and Snakes'?",
    "What is one of the main reasons Coriolanus is chosen to sing the national anthem at Arachne's funeral in 'The Ballad of Songbirds and Snakes'?",
    "Who arrives at the District 12 Peacekeeper base soon after Coriolanus in 'The Ballad of Songbirds and Snakes'?",
    "Where did the jabberjays originate in 'The Ballad of Songbirds and Snakes'?",
    "Why does Coriolanus feel he needs to kill Lucy Gray?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena?",
    "Coriolanus regularly receives a box from Mrs. Plinth in 'The Ballad of Songbirds and Snakes'. What is in the box?",
    "What is Sejanus sprinkling on Marcus's body in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus wear in his lapel at the interview to remind everyone that Lucy Gray 'belongs to' him in 'The Ballad of Songbirds and Snakes'?"
]
ground_truths = [
    ["Lucy Gray Baird from District 12."],
    ["District 13."],
    ["District 2."],
    ["A ruffled dress in rainbow colors."],
    ["The monkey house in the zoo."],
    ["He drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["He uses a jabberjay to record Sejanus talking about his part in a rebel plan."],
    ["He knows all the words."],
    ["Sejanus."],
    ["In Dr. Gaul's lab."],
    ["She can tie him to Mayfair's murder."],
    ["Drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["Baked goods."],
    ["Breadcrumbs so he will have food to eat during his journey."],
    ["A rose that matches the one in her hair."]
]
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [5]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [6]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [7]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

In [8]:
columns = ["System", "Faithfulness", "Answer Relevancy", "Context Precision", "Context Recall", "Answer Similarity", "Answer Correctness"]
results_df = pd.DataFrame(columns=columns)

In [9]:
max_average = 0
def find_highest(average_score):
    global max_average
    if average_score > max_average:
        max_average = average_score
        print("This is the new best value!")

In [10]:
def dictionary(result):
    dict_result = dict(result)
    average_score = sum(dict_result.values()) / len(dict_result)
    print(f"The average score is: {average_score}")
    find_highest(average_score)
    return average_score

In [11]:
def evaluate_system(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": result["context_precision"],
        "Context Recall": result["context_recall"],
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_system_results = pd.DataFrame([system_results])
    return df_system_results

In [12]:
def evaluate_LLM(system_name, questions, answers, contexts, ground_truths):
    result = evaluation_rag(questions, answers, contexts, ground_truths)
    average = dictionary(result)
    # Create a dictionary to store the results
    system_results = {
        "System": system_name,
        "Faithfulness": result["faithfulness"],
        "Answer Relevancy": result["answer_relevancy"],
        "Context Precision": np.nan,
        "Context Recall": np.nan,
        "Answer Similarity": result["answer_similarity"],
        "Answer Correctness": result["answer_correctness"],
        "Average": average
    }
    df_llm_results = pd.DataFrame([system_results])
    return df_llm_results

### General answers by LLM

In [13]:
template = """Write a concise answer to the following question: {question}"""
prompt = ChatPromptTemplate.from_template(template)

In [14]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [15]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [16]:
llm_results = evaluate_LLM("GPT-3.5", questions, answers_llm, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results], ignore_index=True)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:06<00:00, 14.36it/s]


The average score is: 0.3905568657193135
This is the new best value!
    System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-3.5      0.083333          0.810294                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.793104            0.256609  0.390557  


  results_df = pd.concat([results_df, llm_results], ignore_index=True)


In [17]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)
llm_results_gpt4 = evaluate_LLM("GPT-4",questions, answers_llm_gpt4, contexts_llm, ground_truths)
results_df = pd.concat([results_df, llm_results_gpt4], ignore_index=True)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:06<00:00, 12.98it/s]


The average score is: 0.5072359313320577
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  GPT-4      0.666667          0.887922                NaN             NaN   

   Answer Similarity  Answer Correctness   Average  
0           0.801728            0.287099  0.507236  


In [18]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236


### Naive RAG

In [19]:
loader = PyPDFLoader(r"..\ballad\the_ballad_of_songbirds_and_snakes.pdf")
documents = loader.load()

In [20]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../ballad/vectordb-edit/naive")
db_naive.persist()
retriever_naive = db_naive.as_retriever()

In [21]:
template = """User input {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [22]:
def retrieval_chain(prompt, retriever, llm):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    return retrieval_augmented_qa_chain

In [23]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_naive, llm).invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)

In [24]:
result_naive_rag = evaluate_system("Naive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_naive_rag], ignore_index=True)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5548861942589303
This is the new best value!
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0  Naive      0.558016          0.659556           0.787525        0.514306   

   Answer Similarity  Answer Correctness   Average  
0           0.280777            0.529137  0.554886  


## try recursive text splitter

In [25]:
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../ballad/vectordb-edit/recursive_basic")
db_basic.persist()
retriever_basic = db_basic.as_retriever()

In [26]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_chain(prompt, retriever_basic, llm).invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [27]:
result_recursive = evaluate_system("Recursive", questions, answers_naive, contexts_naive, ground_truths)
results_df = pd.concat([results_df, result_recursive], ignore_index=True)
result_recursive

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.531828253100994


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828


## chunk size change

In [30]:
def run_and_evaluate(name, retriever, prompt, llm, results_df):
    answers = []
    contexts_extra = []

    for query in questions:
        try:  
            response = retrieval_chain(prompt, retriever, llm).invoke({"question": query})
            # Access the response content
            answers.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_extra.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_extra.append(context_content)

    result = evaluate_system(name, questions, answers, contexts_extra, ground_truths)
    results_df = pd.concat([results_df, result], ignore_index=True)
    return result, results_df

In [31]:
chunk_sizes = [500, 1000, 2000, 3000]
overlap_percentages = [0, 5, 10, 15, 20]

for chunk_size in chunk_sizes:
    for overlap_percentage in overlap_percentages:
        # Calculate overlap based on percentage
        chunk_overlap = int(chunk_size * overlap_percentage / 100)
        
        # Create text splitter
        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        
        # Split documents
        chunks = text_splitter.split_documents(documents)
        
        # Print number of chunks
        print(f"Number of chunks for chunk size {chunk_size}, overlap {overlap_percentage}%: {len(chunks)}")
        
        # Create Chroma database
        db = Chroma.from_documents(chunks, embeddings_client, persist_directory=f"../ballad/vectordb-edit/chunking_{chunk_size}_{overlap_percentage}")
        db.persist()
        
        # Create retriever
        retriever = db.as_retriever()
        
        # Run and evaluate
        result,results_df = run_and_evaluate(f"Chunk {chunk_size}, overlap {overlap_percentage}%", retriever, prompt, llm, results_df)
        print(f"CHUNK SIZE {chunk_size}, {overlap_percentage}% overlap:")
        print(result)

Number of chunks for chunk size 500, overlap 0%: 735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.46003883362226006
CHUNK SIZE 500, 0% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 0%      0.517857          0.088874            0.78231   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.277026           0.513533            0.580632  0.460039  
Number of chunks for chunk size 500, overlap 5%: 735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5292776333401444
CHUNK SIZE 500, 5% overlap:
                  System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 5%      0.565476          0.218776           0.801542   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.346715           0.585408            0.657749  0.529278  
Number of chunks for chunk size 500, overlap 10%: 735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-863' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-864' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-865' coro=<AsyncClient.aclose() done, defined

The average score is: 0.5695708844336719
This is the new best value!
CHUNK SIZE 500, 10% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 10%      0.652778          0.230681           0.793275   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.326791           0.746519            0.667383  0.569571  
Number of chunks for chunk size 500, overlap 15%: 735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5361153121892916
CHUNK SIZE 500, 15% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 15%      0.841746          0.535714           0.107143   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.812384           0.341578            0.578127  0.536115  
Number of chunks for chunk size 500, overlap 20%: 735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.4721818772509434
CHUNK SIZE 500, 20% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 500, overlap 20%       0.63609          0.486717           0.237498   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.684846           0.471942            0.315999  0.472182  
Number of chunks for chunk size 1000, overlap 0%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5481516604206526
CHUNK SIZE 1000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 0%      0.364984          0.852743           0.731757   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.37963           0.288185            0.671611  0.548152  
Number of chunks for chunk size 1000, overlap 5%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   6%|▌         | 5/90 [00:05<01:35,  1.13s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'sever

The average score is: 0.5067525562246075
CHUNK SIZE 1000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 5%       0.39897          0.648575           0.641253   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.398148           0.383179            0.570392  0.506753  
Number of chunks for chunk size 1000, overlap 10%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-1614' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1615' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-1616' coro=<AsyncClient.aclose() done, defi

The average score is: 0.533673776620536
CHUNK SIZE 1000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 10%      0.362868           0.77005           0.729893   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.37963           0.288185            0.671417  0.533674  
Number of chunks for chunk size 1000, overlap 15%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5529962579810227
CHUNK SIZE 1000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 15%      0.544835          0.650798           0.793733   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.519767           0.277074            0.531771  0.552996  
Number of chunks for chunk size 1000, overlap 20%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  87%|████████▋ | 78/90 [00:11<00:01,  6.30it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:13<00:00,  6.55it/s]


The average score is: 0.5154238086290909
CHUNK SIZE 1000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 1000, overlap 20%      0.642857          0.741209           0.412963   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.233333           0.795649            0.266531  0.515424  
Number of chunks for chunk size 2000, overlap 0%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5302768916054704
CHUNK SIZE 2000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 0%      0.346149          0.829901           0.662909   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.37963           0.288185            0.674888  0.530277  
Number of chunks for chunk size 2000, overlap 5%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:15<00:00,  5.89it/s]


The average score is: 0.5411518162008683
CHUNK SIZE 2000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 5%      0.686905          0.817278           0.407407   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.3           0.790828            0.244493  0.541152  
Number of chunks for chunk size 2000, overlap 10%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  44%|████▍     | 40/90 [00:06<00:03, 14.37it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:12<00:00,  7.22it/s]


The average score is: 0.5286489780317086
CHUNK SIZE 2000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 10%      0.665873          0.815407           0.374074   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.3           0.787232            0.229308  0.528649  
Number of chunks for chunk size 2000, overlap 15%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  79%|███████▉  | 71/90 [00:10<00:01,  9.62it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:13<00:00,  6.89it/s]


The average score is: 0.5531542398095859
CHUNK SIZE 2000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 15%      0.739286          0.801774            0.37963   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.3           0.791637              0.3066  0.553154  
Number of chunks for chunk size 2000, overlap 20%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   6%|▌         | 5/90 [00:02<00:30,  2.79it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'sever

The average score is: 0.5719441699627786
This is the new best value!
CHUNK SIZE 2000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 2000, overlap 20%       0.75821          0.526721           0.722255   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.730503           0.456451            0.237525  0.571944  
Number of chunks for chunk size 3000, overlap 0%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5098492236396049
CHUNK SIZE 3000, 0% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 0%      0.336367          0.707624           0.672505   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.37963           0.288185            0.674785  0.509849  
Number of chunks for chunk size 3000, overlap 5%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5200598611406061
CHUNK SIZE 3000, 5% overlap:
                   System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 5%      0.336423          0.794321           0.669045   

   Context Recall  Answer Similarity  Answer Correctness  Average  
0        0.357407           0.288185            0.674978  0.52006  
Number of chunks for chunk size 3000, overlap 10%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': True, 'severity': 'medium'}, 'violence': {'filtered': False, 'severity': 

The average score is: 0.5302058967814511
CHUNK SIZE 3000, 10% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 10%      0.387645          0.713186           0.734147   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0         0.37963           0.291758             0.67487  0.530206  
Number of chunks for chunk size 3000, overlap 15%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  93%|█████████▎| 84/90 [00:12<00:00, 18.09it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:16<00:00,  5.62it/s]


The average score is: 0.5504475406090757
CHUNK SIZE 3000, 15% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 15%       0.78836          0.741365           0.374074   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0             0.3           0.792157             0.30673  0.550448  
Number of chunks for chunk size 3000, overlap 20%: 413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  89%|████████▉ | 80/90 [00:22<00:04,  2.35it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:23<00:00,  3.85it/s]


The average score is: 0.5278333436218005
CHUNK SIZE 3000, 20% overlap:
                    System  Faithfulness  Answer Relevancy  Context Precision  \
0  Chunk 3000, overlap 20%       0.80754          0.677429           0.357407   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.266667           0.793699            0.264258  0.527833  


In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
# chunks_1000 = text_splitter.split_documents(documents)
# print(len(chunks_1000))
# db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../ballad/vectordb-edit/recursive_1000")
# db_1000.persist()
# retriever_1000 = db_1000.as_retriever()
# result_1000 = change_chunk_size(retriever_1000)
# print("CHUNK SIZE 1000")
# print(result_1000)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
# chunks_500 = text_splitter.split_documents(documents)
# print(len(chunks_500))
# db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../ballad/vectordb-edit/recursive_500")
# db_500.persist()
# retriever_500 = db_500.as_retriever()
# result_500 = change_chunk_size(retriever_500)
# print("CHUNK SIZE 500")
# print(result_500)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
# chunks_2000 = text_splitter.split_documents(documents)
# print(len(chunks_2000))
# db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../ballad/vectordb-edit/recursive_2000")
# db_2000.persist()
# retriever_2000 = db_2000.as_retriever()
# result_2000 = change_chunk_size(retriever_2000)
# print("CHUNK SIZE 2000")
# print(result_2000)

In [None]:
# text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
# chunks_3000 = text_splitter.split_documents(documents)
# print(len(chunks_3000))
# db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../ballad/vectordb-edit/recursive_3000")
# db_3000.persist()
# retriever_3000 = db_3000.as_retriever()
# result_3000 = change_chunk_size(retriever_3000)
# print("CHUNK SIZE 3000")
# print(result_3000)

In [32]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236
2,Naive,0.558016,0.659556,0.787525,0.514306,0.280777,0.529137,0.554886
3,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828
4,"Chunk 500, overlap 0%",0.517857,0.088874,0.78231,0.277026,0.513533,0.580632,0.460039
5,"Chunk 500, overlap 5%",0.565476,0.218776,0.801542,0.346715,0.585408,0.657749,0.529278
6,"Chunk 500, overlap 10%",0.652778,0.230681,0.793275,0.326791,0.746519,0.667383,0.569571
7,"Chunk 500, overlap 15%",0.841746,0.535714,0.107143,0.812384,0.341578,0.578127,0.536115
8,"Chunk 500, overlap 20%",0.63609,0.486717,0.237498,0.684846,0.471942,0.315999,0.472182
9,"Chunk 1000, overlap 0%",0.364984,0.852743,0.731757,0.37963,0.288185,0.671611,0.548152


In [34]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.5719441699627786


In [33]:
results_df.to_csv(f"../ballad/results/results_qa.csv", index=False)

### now time to look for different top-k

Note: We continue with the size chunk of 500 as it had the highest average score

In [35]:
db_k = Chroma(persist_directory = "../ballad/vectordb-edit/chunking_2000_20", embedding_function=embeddings_client)

In [36]:
k_values = [2, 3, 5, 6]

# Iterate over different k values
for k in k_values:
    # Create retriever with k value
    retriever = db_k.as_retriever(search_kwargs={"k": k})
    
    # Run and evaluate
    result,results_df = run_and_evaluate(f"Chunk size {chunk_size}, overlap {overlap_percentage}%, K={k}", retriever, prompt, llm, results_df)
    print(f"Results for K={k}:")
    print(result)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   1%|          | 1/90 [00:04<06:56,  4.68s/it]Runner in Executor raised an exception
TypeError: expected string or buffer
Evaluating:  79%|███████▉  | 71/90 [00:05<00:00, 22.18it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:06<00:00, 14.76it/s]


The average score is: 0.41005265498550814
Results for K=2:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=2       0.41197          0.282099   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.579457        0.266667           0.326902            0.593221   

    Average  
0  0.410053  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-3487' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-3488' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-3489' coro=<AsyncClient.aclose() done, defi

The average score is: 0.4294029879648697
Results for K=3:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=3      0.261245          0.443421   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.594656        0.327778           0.273538             0.67578   

    Average  
0  0.429403  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:12<00:00,  7.31it/s]


The average score is: 0.519104684815868
Results for K=5:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=5      0.740741          0.806737   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.395926        0.116667           0.793101            0.261457   

    Average  
0  0.519105  


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Task exception was never retrieved
future: <Task finished name='Task-3749' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-3750' coro=<AsyncClient.aclose() done, defined at c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\httpx\_client.py:2011> exception=RuntimeError('Event loop is closed')>
RuntimeError: Event loop is closed
Task exception was never retrieved
future: <Task finished name='Task-3751' coro=<AsyncClient.aclose() done, defi

The average score is: 0.5558102541492808
Results for K=6:
                              System  Faithfulness  Answer Relevancy  \
0  Chunk size 3000, overlap 20%, K=6      0.447991          0.224892   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.821103        0.390395           0.701033            0.749447   

   Average  
0  0.55581  


### look for different retrievers

7 chunks was the best score

In [37]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

parent_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap = 200)
child_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents",persist_directory = "../ballad/vectordb-edit/parent", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)
parent_document_retriever.add_documents(documents)
result_parent, results_df = run_and_evaluate(f"Parent Retriever 1000-200", parent_document_retriever, prompt, llm, results_df)
print(result_parent)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.572733279936672
This is the new best value!
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1000-200      0.408467          0.668263   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.598895        0.679993            0.64895            0.431832   

    Average  
0  0.572733  


In [38]:
parent_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=500, chunk_overlap = 50)
child_splitter_small = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_small",persist_directory = "../ballad/vectordb-edit/parent_small", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_small = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_small,
    parent_splitter=parent_splitter_small,
)
parent_document_retriever_small.add_documents(documents)
result_parent_small, results_df = run_and_evaluate(f"Parent Retriever 500-100", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_small)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  29%|██▉       | 26/90 [00:08<00:14,  4.40it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'seve

The average score is: 0.5350661417942891
                     System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 500-100      0.529343          0.496213   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.604886        0.529791           0.428873            0.621292   

    Average  
0  0.535066  


In [39]:
parent_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1500, chunk_overlap = 150)
child_splitter_large = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=200, chunk_overlap = 0)

vectorstore = Chroma(collection_name="split_parents_large",persist_directory = "../ballad/vectordb-edit/parent_large", embedding_function=embeddings_client)
store = InMemoryStore()
parent_document_retriever_large = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter_large,
    parent_splitter=parent_splitter_large,
)
parent_document_retriever_large.add_documents(documents)
result_parent_large , results_df = run_and_evaluate(f"Parent Retriever 1500-200", parent_document_retriever_small, prompt, llm, results_df)
print(result_parent_large)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
ValueError: Azure has not provided the response due to a content filter being triggered
Evaluating:   1%|          | 1/90 [00:01<02:31,  1.70s/it]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filt

The average score is: 0.5444346678198149
                      System  Faithfulness  Answer Relevancy  \
0  Parent Retriever 1500-200      0.668834          0.381221   

   Context Precision  Context Recall  Answer Similarity  Answer Correctness  \
0           0.648756        0.497418           0.475129             0.59525   

    Average  
0  0.544435  


In [40]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.572733279936672


In [41]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236
2,Naive,0.558016,0.659556,0.787525,0.514306,0.280777,0.529137,0.554886
3,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828
4,"Chunk 500, overlap 0%",0.517857,0.088874,0.78231,0.277026,0.513533,0.580632,0.460039
5,"Chunk 500, overlap 5%",0.565476,0.218776,0.801542,0.346715,0.585408,0.657749,0.529278
6,"Chunk 500, overlap 10%",0.652778,0.230681,0.793275,0.326791,0.746519,0.667383,0.569571
7,"Chunk 500, overlap 15%",0.841746,0.535714,0.107143,0.812384,0.341578,0.578127,0.536115
8,"Chunk 500, overlap 20%",0.63609,0.486717,0.237498,0.684846,0.471942,0.315999,0.472182
9,"Chunk 1000, overlap 0%",0.364984,0.852743,0.731757,0.37963,0.288185,0.671611,0.548152


In [46]:
replacement_map = {
    'Chunk size 1000, 0%, K=2': 'Chunk size 2000, 20%, K=2',
'Chunk size 1000, 0%, K=3':'Chunk size 2000, 20%, K=3',
     'Chunk size 1000, 0%, K=5':'Chunk size 2000, 20%, K=5',
    'Chunk size 1000, 0%, K=6':'Chunk size 2000, 20%, K=6'
}
results_df['System'] = results_df['System'].replace(replacement_map)

In [47]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236
2,Naive,0.558016,0.659556,0.787525,0.514306,0.280777,0.529137,0.554886
3,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828
4,"Chunk 500, overlap 0%",0.517857,0.088874,0.78231,0.277026,0.513533,0.580632,0.460039
5,"Chunk 500, overlap 5%",0.565476,0.218776,0.801542,0.346715,0.585408,0.657749,0.529278
6,"Chunk 500, overlap 10%",0.652778,0.230681,0.793275,0.326791,0.746519,0.667383,0.569571
7,"Chunk 500, overlap 15%",0.841746,0.535714,0.107143,0.812384,0.341578,0.578127,0.536115
8,"Chunk 500, overlap 20%",0.63609,0.486717,0.237498,0.684846,0.471942,0.315999,0.472182
9,"Chunk 1000, overlap 0%",0.364984,0.852743,0.731757,0.37963,0.288185,0.671611,0.548152


#### Maximum marginal relevance retrieval

In [48]:
retriever_mmr = db_k.as_retriever(search_type="mmr")
result_mmr, results_df = run_and_evaluate(f"MMR", retriever_mmr, prompt, llm, results_df)
print("Marginal relevance")
print(result_mmr)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  78%|███████▊  | 70/90 [00:06<00:00, 28.61it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': True, 'severity': 'medium'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'se

The average score is: 0.47590768652696824
Marginal relevance
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0    MMR      0.722222          0.611605           0.395734        0.123333   

   Answer Similarity  Answer Correctness   Average  
0           0.786041             0.21651  0.475908  


#### BM25

In [49]:
highest_average = results_df["Average"].max()
print("Highest average value:", highest_average)

Highest average value: 0.572733279936672


In [50]:
from langchain.retrievers import BM25Retriever
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 400)
chunks_1000 = text_splitter.split_documents(documents)
retriever_bm25 = BM25Retriever.from_documents(chunks_1000)
result_bm25, results_df = run_and_evaluate(f"BM25", retriever_bm25, prompt, llm, results_df)
print("BM25")
print(result_bm25)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.41237378448922163
BM25
  System  Faithfulness  Answer Relevancy  Context Precision  Context Recall  \
0   BM25      0.458274          0.461641           0.326519        0.312199   

   Answer Similarity  Answer Correctness   Average  
0           0.585142            0.330468  0.412374  


#### Ensambler - Hybrid

In [51]:
ret = db_k.as_retriever()

In [52]:
from langchain.retrievers import EnsembleRetriever

ensemble_retriever_1 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.75, 0.25])
result_ensemble1, results_df = run_and_evaluate(f"Ensambler 1", ensemble_retriever_1, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5803926758896713
This is the new best value!
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 1      0.478271          0.703551           0.671269   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.673675           0.566497            0.389093  0.580393  


In [53]:
ensemble_retriever_2 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.5, 0.5])
result_ensemble2, results_df = run_and_evaluate(f"Ensambler 2", ensemble_retriever_2, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble2)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5868242857901301
This is the new best value!
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 2      0.621946          0.429459           0.584322   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.682494           0.625598            0.577126  0.586824  


In [54]:
ensemble_retriever_3 = EnsembleRetriever(retrievers=[retriever_bm25, ret], weights=[0.25,0.75])
result_ensemble3, results_df = run_and_evaluate(f"Ensambler 3", ensemble_retriever_3, prompt, llm, results_df)
print("Ensambler")
print(result_ensemble3)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6292721453464117
This is the new best value!
Ensambler
        System  Faithfulness  Answer Relevancy  Context Precision  \
0  Ensambler 3      0.614624           0.72338           0.801633   

   Context Recall  Answer Similarity  Answer Correctness   Average  
0        0.402776           0.611634            0.621585  0.629272  


In [55]:
results_df.to_csv(f"../ballad/results/results_qa.csv")

#### Multi-stage - reranker

In [19]:
import getpass

os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API Key:")

In [29]:
questions = [
    "Who wrote 'The Hanging Tree' song?",
    "Which District was the source of the Snow family's wealth before the war?",
    "Which district did the Plinth family call home before moving to the Capitol?",
    "What is Lucy Gray wearing when she first appears in 'The Ballad of Songbirds and Snakes'?",
    "Where are the tributes sent immediately after they arrive in the Capitol in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do that causes Sejanus's execution in 'The Ballad of Songbirds and Snakes'?",
    "What is one of the main reasons Coriolanus is chosen to sing the national anthem at Arachne's funeral in 'The Ballad of Songbirds and Snakes'?",
    "Who arrives at the District 12 Peacekeeper base soon after Coriolanus in 'The Ballad of Songbirds and Snakes'?"
]
ground_truths = [
    ["Lucy Gray Baird from District 12."],
    ["District 13."],
    ["District 2."],
    ["A ruffled dress in rainbow colors."],
    ["The monkey house in the zoo."],
    ["He drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["He uses a jabberjay to record Sejanus talking about his part in a rebel plan."],
    ["He knows all the words."],
    ["Sejanus."]
]

In [32]:
questions = [
    "Where did the jabberjays originate in 'The Ballad of Songbirds and Snakes'?",
    "Why does Coriolanus feel he needs to kill Lucy Gray?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena?",
    "Coriolanus regularly receives a box from Mrs. Plinth in 'The Ballad of Songbirds and Snakes'. What is in the box?",
    "What is Sejanus sprinkling on Marcus's body in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus wear in his lapel at the interview to remind everyone that Lucy Gray 'belongs to' him in 'The Ballad of Songbirds and Snakes'?"
]
ground_truths = [
    ["In Dr. Gaul's lab."],
    ["She can tie him to Mayfair's murder."],
    ["Drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["Baked goods."],
    ["Breadcrumbs so he will have food to eat during his journey."],
    ["A rose that matches the one in her hair."]
]

In [23]:
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever_3
)

result_compression_1 = change_chunk_size(compression_retriever)
print("Reranker")
print(result_compression_1)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 54/54 [00:06<00:00,  8.97it/s]


Reranker
{'faithfulness': 0.8333, 'answer_relevancy': 0.8419, 'context_precision': 0.7593, 'context_recall': 0.5370, 'answer_similarity': 0.8132, 'answer_correctness': 0.3978}


In [25]:
result_compression_2 = change_chunk_size(compression_retriever)
print("Reranker")
print(result_compression_2)



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDriv

Reranker
{'faithfulness': 0.7000, 'answer_relevancy': 0.1810, 'context_precision': 0.8199, 'context_recall': 0.3550, 'answer_similarity': 0.6015, 'answer_correctness': 0.4136}


In [26]:
df_1=result_compression_1.to_pandas
df_1

<bound method Result.to_pandas of {'faithfulness': 0.8333, 'answer_relevancy': 0.8419, 'context_precision': 0.7593, 'context_recall': 0.5370, 'answer_similarity': 0.8132, 'answer_correctness': 0.3978}>

In [27]:
df_2=result_compression_2.to_pandas
df_2

<bound method Result.to_pandas of {'faithfulness': 0.7000, 'answer_relevancy': 0.1810, 'context_precision': 0.8199, 'context_recall': 0.3550, 'answer_similarity': 0.6015, 'answer_correctness': 0.4136}>

In [56]:
results_df

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236
2,Naive,0.558016,0.659556,0.787525,0.514306,0.280777,0.529137,0.554886
3,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828
4,"Chunk 500, overlap 0%",0.517857,0.088874,0.78231,0.277026,0.513533,0.580632,0.460039
5,"Chunk 500, overlap 5%",0.565476,0.218776,0.801542,0.346715,0.585408,0.657749,0.529278
6,"Chunk 500, overlap 10%",0.652778,0.230681,0.793275,0.326791,0.746519,0.667383,0.569571
7,"Chunk 500, overlap 15%",0.841746,0.535714,0.107143,0.812384,0.341578,0.578127,0.536115
8,"Chunk 500, overlap 20%",0.63609,0.486717,0.237498,0.684846,0.471942,0.315999,0.472182
9,"Chunk 1000, overlap 0%",0.364984,0.852743,0.731757,0.37963,0.288185,0.671611,0.548152


In [57]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
35,Ensambler 3,0.614624,0.72338,0.801633,0.402776,0.611634,0.621585,0.629272
34,Ensambler 2,0.621946,0.429459,0.584322,0.682494,0.625598,0.577126,0.586824
33,Ensambler 1,0.478271,0.703551,0.671269,0.673675,0.566497,0.389093,0.580393


#### creating context by remaking the query

In [58]:
template_context = "Generate a search query to fetch the relevant documents using the user's {question}. Craft a query that specifically targets the keywords in the question. In the answer provide only the query."
prompt_context = ChatPromptTemplate.from_template(template_context)

In [59]:
answers_final = []
contexts_final = []
llm_for_context =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt_context | llm}
)
for query in questions:
    response_check = llm_for_context.invoke({"question": query})
    search_query = response_check["response"].content
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("context"), "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
)
    docs = ensemble_retriever_3.get_relevant_documents(search_query)
    formatted_docs = []
    for doc in docs:
        resulting_doc = doc.page_content
        formatted_docs.append(resulting_doc)
    try:  
            response = retrieval_augmented_qa_chain.invoke({"context": formatted_docs, "question": query})
            # Access the response content
            answers_final.append(response["response"].content)
            contexts_final.append(formatted_docs)  
    except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_final.append("No answer")
            contexts_final.append(formatted_docs)

In [60]:
result_search_query = evaluation_rag(questions, answers_final, contexts_final, ground_truths)
result_search_query

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

{'faithfulness': 0.5932, 'answer_relevancy': 0.5870, 'context_precision': 0.4648, 'context_recall': 0.3916, 'answer_similarity': 0.7860, 'answer_correctness': 0.3464}

In [61]:
average = dictionary(result_search_query)
    # Create a dictionary to store the results
system_results = {
        "System": "Search query",
        "Faithfulness": result_search_query["faithfulness"],
        "Answer Relevancy": result_search_query["answer_relevancy"],
        "Context Precision": result_search_query["context_precision"],
        "Context Recall": result_search_query["context_recall"],
        "Answer Similarity": result_search_query["answer_similarity"],
        "Answer Correctness": result_search_query["answer_correctness"],
        "Average": average
    }
df_result_search_query = pd.DataFrame([system_results])
results_df = pd.concat([results_df, df_result_search_query], ignore_index=True)
results_df

The average score is: 0.528162961943606


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,GPT-3.5,0.083333,0.810294,,,0.793104,0.256609,0.390557
1,GPT-4,0.666667,0.887922,,,0.801728,0.287099,0.507236
2,Naive,0.558016,0.659556,0.787525,0.514306,0.280777,0.529137,0.554886
3,Recursive,0.379104,0.744159,0.721819,0.383333,0.288185,0.67437,0.531828
4,"Chunk 500, overlap 0%",0.517857,0.088874,0.78231,0.277026,0.513533,0.580632,0.460039
5,"Chunk 500, overlap 5%",0.565476,0.218776,0.801542,0.346715,0.585408,0.657749,0.529278
6,"Chunk 500, overlap 10%",0.652778,0.230681,0.793275,0.326791,0.746519,0.667383,0.569571
7,"Chunk 500, overlap 15%",0.841746,0.535714,0.107143,0.812384,0.341578,0.578127,0.536115
8,"Chunk 500, overlap 20%",0.63609,0.486717,0.237498,0.684846,0.471942,0.315999,0.472182
9,"Chunk 1000, overlap 0%",0.364984,0.852743,0.731757,0.37963,0.288185,0.671611,0.548152


### change model to GPT-4

In [62]:
top_3_highest = results_df.nlargest(3, "Average")
top_3_highest

Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
35,Ensambler 3,0.614624,0.72338,0.801633,0.402776,0.611634,0.621585,0.629272
34,Ensambler 2,0.621946,0.429459,0.584322,0.682494,0.625598,0.577126,0.586824
33,Ensambler 1,0.478271,0.703551,0.671269,0.673675,0.566497,0.389093,0.580393


In [63]:
result_ensemble3_gpt4, results_df = run_and_evaluate(f"Ensambler 3, GPT-4", ensemble_retriever_3, prompt, llm_gpt4, results_df)
result_ensemble3_gpt4



passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'm

The average score is: 0.6301538266175434
This is the new best value!


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Ensambler 3, GPT-4",0.703828,0.777438,0.664803,0.405762,0.583367,0.645725,0.630154


In [64]:
result_ensemble2_gpt4, results_df = run_and_evaluate(f"Ensambler 2, GPT-4", ensemble_retriever_2, prompt, llm_gpt4, results_df)
result_ensemble2_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5801561033439621


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Ensambler 2, GPT-4",0.611953,0.700445,0.549128,0.66728,0.484443,0.467687,0.580156


In [65]:
result_ensemble1_gpt4, results_df = run_and_evaluate(f"Ensambler 1, GPT-4", ensemble_retriever_1, prompt, llm_gpt4, results_df)
result_ensemble1_gpt4

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
openai.BadRequestError: Error code: 400 - {'error': {'message': "The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766", 'type': None, 'param': 'prompt', 'code': 'content_filter', 'status': 400, 'innererror': {'code': 'ResponsibleAIPolicyViolation', 'content_filter_result': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'low'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': True, 'severity': 'me

The average score is: 0.5811690432381572


Unnamed: 0,System,Faithfulness,Answer Relevancy,Context Precision,Context Recall,Answer Similarity,Answer Correctness,Average
0,"Ensambler 1, GPT-4",0.541574,0.33571,0.596699,0.712009,0.716593,0.58443,0.581169


In [66]:
results_df.to_csv(f"../ballad/results/results_qa.csv", index=False)