In [1]:
!pip install pypdf



In [2]:
import dotenv
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_openai import AzureOpenAIEmbeddings
dotenv.load_dotenv()
from langchain_community.vectorstores import Chroma
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
)
from datasets import Dataset
from langchain_openai import AzureChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
import os
from langchain_community.document_loaders import PyPDFLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_VERSION = os.environ.get("OPENAI_API_VERSION")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL")
OPENAI_DEPLOYMENT = os.environ.get("OPENAI_DEPLOYMENT")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")
EMBEDDING_DEPLOYMENT = os.environ.get("EMBEDDING_DEPLOYMENT")
OPENAI_MODEL_GPT4 = os.environ.get("OPENAI_MODEL_GPT4")
OPENAI_DEPLOYMENT_GPT4 = os.environ.get("OPENAI_DEPLOYMENT_GPT4")

In [4]:
questions = [
    "Who wrote 'The Hanging Tree' song?",
    "Which District was the source of the Snow family's wealth before the war?",
    "Which district did the Plinth family call home before moving to the Capitol?",
    "What is Lucy Gray wearing when she first appears in 'The Ballad of Songbirds and Snakes'?",
    "Where are the tributes sent immediately after they arrive in the Capitol in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus do that causes Sejanus's execution in 'The Ballad of Songbirds and Snakes'?",
    "What is one of the main reasons Coriolanus is chosen to sing the national anthem at Arachne's funeral in 'The Ballad of Songbirds and Snakes'?",
    "Who arrives at the District 12 Peacekeeper base soon after Coriolanus in 'The Ballad of Songbirds and Snakes'?",
    "Where did the jabberjays originate in 'The Ballad of Songbirds and Snakes'?",
    "Why does Coriolanus feel he needs to kill Lucy Gray?",
    "What does Coriolanus do to keep Lucy Gray safe when he realizes Dr. Gaul is putting her hybrid snakes in the arena?",
    "Coriolanus regularly receives a box from Mrs. Plinth in 'The Ballad of Songbirds and Snakes'. What is in the box?",
    "What is Sejanus sprinkling on Marcus's body in the arena in 'The Ballad of Songbirds and Snakes'?",
    "What does Coriolanus wear in his lapel at the interview to remind everyone that Lucy Gray 'belongs to' him in 'The Ballad of Songbirds and Snakes'?"
]
ground_truths = [
    ["Lucy Gray Baird from District 12."],
    ["District 13."],
    ["District 2."],
    ["A ruffled dress in rainbow colors."],
    ["The monkey house in the zoo."],
    ["He drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["He uses a jabberjay to record Sejanus talking about his part in a rebel plan."],
    ["He knows all the words."],
    ["Sejanus."],
    ["In Dr. Gaul's lab."],
    ["She can tie him to Mayfair's murder."],
    ["Drops a handkerchief with Lucy Gray's scent into the snake tank."],
    ["Baked goods."],
    ["Breadcrumbs so he will have food to eat during his journey."],
    ["A rose that matches the one in her hair."]
]
answers_llm = []
contexts_llm = [[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""],[""]]

In [18]:
embeddings_client = AzureOpenAIEmbeddings(
    azure_deployment=EMBEDDING_DEPLOYMENT,
    openai_api_version=OPENAI_API_VERSION)
llm = AzureChatOpenAI(model_name=OPENAI_MODEL, azure_deployment=OPENAI_DEPLOYMENT,temperature=0)
llm_gpt4 = AzureChatOpenAI(model_name=OPENAI_MODEL_GPT4, azure_deployment=OPENAI_DEPLOYMENT_GPT4,temperature=0)

In [23]:
def evaluation_llm(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
    )
    return result

In [6]:
def evaluation_rag(questions, answers, contexts, ground_truths):
    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truths": ground_truths
    }
    dataset = Dataset.from_dict(data)
    azure_configs = {
        "base_url": AZURE_OPENAI_ENDPOINT,
        "model_deployment": OPENAI_DEPLOYMENT,
        "model_name": OPENAI_MODEL,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "embedding_name": EMBEDDING_MODEL,  # most likely
    }

    azure_model = AzureChatOpenAI(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["model_deployment"],
        model=azure_configs["model_name"],
        validate_base_url=False,
    )

    azure_embeddings = AzureOpenAIEmbeddings(
        openai_api_version=OPENAI_API_VERSION,
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
    )
    result = evaluate(
        dataset = dataset, 
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
            answer_similarity,
            answer_correctness,
        ], 
        llm=azure_model, 
        embeddings=azure_embeddings,
        raise_exceptions=False,
    )
    return result

### General answers by LLM

In [19]:
template = """Write a concise answer to the following question: {question}"""
prompt = ChatPromptTemplate.from_template(template)

In [20]:
llm_chain =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm}
)
llm_chain_gpt4 =(
    { "question": itemgetter("question")}
    | RunnablePassthrough()
    | {"response": prompt | llm_gpt4}
)

In [26]:
for query in questions:
    response = llm_chain.invoke({"question": query})
    answers_llm.append(response["response"].content)

In [27]:
llm_results = evaluation_llm(questions, answers_llm, contexts_llm, ground_truths)
print(llm_results)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:04<00:00, 12.92it/s]


{'faithfulness': 0.0833, 'answer_relevancy': 0.8096, 'answer_similarity': 0.7944, 'answer_correctness': 0.2569}


In [24]:
answers_llm_gpt4 = []
for query in questions:
    response = llm_chain_gpt4.invoke({"question": query})
    answers_llm_gpt4.append(response["response"].content)

In [25]:
llm_results_gpt4 = evaluation_llm(questions, answers_llm_gpt4, contexts_llm, ground_truths)
print(llm_results_gpt4)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 60/60 [00:07<00:00,  8.17it/s]


{'faithfulness': 0.0833, 'answer_relevancy': 0.8099, 'answer_similarity': 0.7931, 'answer_correctness': 0.2566}


### Naive RAG

In [7]:
loader = PyPDFLoader(r"..\ballad\the_ballad_of_songbirds_and_snakes.pdf")
documents = loader.load()

In [8]:
text_splitter = CharacterTextSplitter()
chunks = text_splitter.split_documents(documents)
db_naive = Chroma.from_documents(chunks, embeddings_client, persist_directory = "../ballad/vectordb/naive")
retriever_naive = db_naive.as_retriever()

In [10]:
template = """Write a concise answer to the following question: {question}. 
Context {context}."""
prompt = ChatPromptTemplate.from_template(template)

In [11]:
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever_naive, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

In [12]:
answers_naive = []
contexts_naive = []
for query in questions:
    try:  
        response = retrieval_augmented_qa_chain.invoke({"question": query})
        # Access the response content
        answers_naive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_naive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_naive.append("No answer")
        context_full = retriever_naive.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_naive.append(context_content)



In [13]:
result_naive_rag = evaluation_rag(questions, answers_naive, contexts_naive, ground_truths)
print(result_naive_rag)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   1%|          | 1/90 [00:05<08:04,  5.45s/it]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapina

{'faithfulness': 0.3652, 'answer_relevancy': 0.7389, 'context_precision': 0.6376, 'context_recall': 0.4407, 'answer_similarity': 0.3332, 'answer_correctness': 0.6345}


## try recursive text splitter

In [14]:
text_splitter = text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder()
chunks_r = text_splitter.split_documents(documents)
db_basic = Chroma.from_documents(chunks_r, embeddings_client, persist_directory = "../ballad/vectordb/recursive_basic")
retriever_basic = db_basic.as_retriever()

In [15]:
retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | retriever_basic, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

In [16]:
answers_recursive = []
contexts_recursive = []
for query in questions:
    try:  
        response = retrieval_augmented_qa_chain.invoke({"question": query})
        # Access the response content
        answers_recursive.append(response["response"].content)
        # Access the context content
        context_content = [context.page_content for context in response["context"]]
        contexts_recursive.append(context_content)  
    except Exception as e:  
        print(f"Warning: {e}" + "on the following question: " + query)  
        answers_recursive.append("No answer")
        context_full = retriever_basic.get_relevant_documents(query)
        context_content = [context.page_content for context in context_full]
        contexts_recursive.append(context_content)

In [17]:
result_recursive = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
print(result_recursive)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  22%|██▏       | 20/90 [00:05<00:11,  6.08it/s]Invalid JSON response. Expected dictionary with key 'question'
Evaluating: 100%|██████████| 90/90 [00:12<00:00,  7.43it/s]


{'faithfulness': 0.5926, 'answer_relevancy': 0.7985, 'context_precision': 0.4074, 'context_recall': 0.3000, 'answer_similarity': 0.7997, 'answer_correctness': 0.3183}


## chunk size change

In [28]:
def change_chunk_size(retriever):
    retrieval_augmented_qa_chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
        | RunnablePassthrough.assign(context=itemgetter("context"))
        | {"response": prompt | llm, "context": itemgetter("context")}
    )
    answers_recursive = []
    contexts_recursive = []

    for query in questions:
        try:  
            response = retrieval_augmented_qa_chain.invoke({"question": query})
            # Access the response content
            answers_recursive.append(response["response"].content)
            # Access the context content
            context_content = [context.page_content for context in response["context"]]
            contexts_recursive.append(context_content)  
        except Exception as e:  
            print(f"Warning: {e}" + "on the following question: " + query)  
            answers_recursive.append("No answer")
            context_full = retriever.get_relevant_documents(query)
            context_content = [context.page_content for context in context_full]
            contexts_recursive.append(context_content)


    result = evaluation_rag(questions, answers_recursive, contexts_recursive, ground_truths)
    return result

In [29]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 1000, chunk_overlap = 100)
chunks_1000 = text_splitter.split_documents(documents)
print(len(chunks_1000))
db_1000 = Chroma.from_documents(chunks_1000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_1000")
retriever_1000 = db_1000.as_retriever()
result_1000 = change_chunk_size(retriever_1000)
print("CHUNK SIZE 1000")
print(result_1000)

413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:12<00:00,  7.33it/s]


CHUNK SIZE 1000
{'faithfulness': 0.1042, 'answer_relevancy': 0.8262, 'context_precision': 0.3741, 'context_recall': 0.2667, 'answer_similarity': 0.7931, 'answer_correctness': 0.2233}


In [30]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500, chunk_overlap = 50)
chunks_500 = text_splitter.split_documents(documents)
print(len(chunks_500))
db_500 = Chroma.from_documents(chunks_500, embeddings_client, persist_directory = "../ballad/vectordb/recursive_500")
retriever_500 = db_500.as_retriever()
result_500 = change_chunk_size(retriever_500)
print("CHUNK SIZE 500")
print(result_500)

735


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:  86%|████████▌ | 77/90 [00:07<00:00, 24.61it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapin

CHUNK SIZE 500
{'faithfulness': 0.2821, 'answer_relevancy': 0.0836, 'context_precision': 0.8242, 'context_recall': 0.6093, 'answer_similarity': 0.2066, 'answer_correctness': 0.7132}


In [31]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 2000, chunk_overlap = 200)
chunks_2000 = text_splitter.split_documents(documents)
print(len(chunks_2000))
db_2000 = Chroma.from_documents(chunks_2000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_2000")
retriever_2000 = db_2000.as_retriever()
result_2000 = change_chunk_size(retriever_2000)
print("CHUNK SIZE 2000")
print(result_2000)

413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating: 100%|██████████| 90/90 [00:13<00:00,  6.65it/s]


CHUNK SIZE 2000
{'faithfulness': 0.1042, 'answer_relevancy': 0.8306, 'context_precision': 0.3796, 'context_recall': 0.3000, 'answer_similarity': 0.7944, 'answer_correctness': 0.2236}


In [32]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size = 3000, chunk_overlap = 300)
chunks_3000 = text_splitter.split_documents(documents)
print(len(chunks_3000))
db_3000 = Chroma.from_documents(chunks_3000, embeddings_client, persist_directory = "../ballad/vectordb/recursive_3000")
retriever_3000 = db_3000.as_retriever()
result_3000 = change_chunk_size(retriever_3000)
print("CHUNK SIZE 3000")
print(result_3000)

413


passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDriv

CHUNK SIZE 3000
{'faithfulness': 0.3045, 'answer_relevancy': 0.2429, 'context_precision': 0.8152, 'context_recall': 0.3833, 'answer_similarity': 0.2895, 'answer_correctness': 0.6760}


### now time to look for different top-k

Note: We continue with the size chunk of 500 as it had the highest average score

In [33]:
retriever_3 = db_500.as_retriever(search_kwargs={"k": 3})
result_3 = change_chunk_size(retriever_3)
print("CHUNK SIZE 1000, K=3")
print(result_3)

passing column names as 'ground_truths' is deprecated and will be removed in the next version, please use 'ground_truth' instead. Note that `ground_truth` should be of type string and not Sequence[string] like `ground_truths`
Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]Runner in Executor raised an exception
Traceback (most recent call last):
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 58, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "C:\Users\sigitalapina\AppData\Local\Programs\Python\Python311\Lib\asyncio\tasks.py", line 605, in _wait_for_one
    return f.result()  # May raise f.exception().
           ^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDrive - KPMG\Desktop\thesis-rag\.venv\Lib\site-packages\ragas\executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\sigitalapina\OneDriv

In [None]:
retriever_5 = db_500.as_retriever(search_kwargs={"k": 5})
result_5 = change_chunk_size(retriever_5)
print("CHUNK SIZE 1000, K=5")
print(result_5)

In [None]:
retriever_6 = db_500.as_retriever(search_kwargs={"k": 6})
result_6 = change_chunk_size(retriever_6)
print("CHUNK SIZE 1000, K=5")
print(result_6)