In [34]:
from langchain.storage import InMemoryByteStore
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever
from typing import Iterable
from langchain_core.documents import Document
import json

## Read chunked dataset and raw dataset

In [35]:
def load_docs_from_jsonl(file_path)->Iterable[Document]:
    array = []
    with open(file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            data = json.loads(line)
            obj = Document(**data)
            array.append(obj)
    return array

chunked_documents=load_docs_from_jsonl('../data/processed_dataset/chunked_documents.jsonl')
raw_documents=load_docs_from_jsonl('../data/processed_dataset/raw_documents.jsonl')

In [36]:
# check data contents
chunked_documents[0:5]

[Document(metadata={'title': 'Booking Confirmation and Ticket Instructions', 'location': 'Tromsø, Norway', 'type': 'activity', 'source': 'data/aurora_borealis_tour.pdf', 'doc_id': 0, 'chunk_id': 0}, page_content='Booking Confirmation and Ticket Instructions:Thanks for your order, Nadine. For easy access to your ticket and to be able to manage your booking on the go, download our app. November 30, 2023, at 6:30 PM From Tromsø: Aurora Borealis Tour 4 Adults (Age 0 - 99) • English • 6 hours € 555. See activity details. Get the GetYourGuide app to access your activity. Access your activity with the GetYourGuide app. Just open your tickets in the app and you’ll be ready to go.'),
 Document(metadata={'title': 'Important Information and Meeting Instructions', 'location': 'Tromsø, Norway', 'type': 'activity', 'source': 'data/aurora_borealis_tour.pdf', 'doc_id': 0, 'chunk_id': 1}, page_content="Important Information and Meeting Instructions:Where to go: Fredrik Langes gate 2, 9008 Tromsø, Norwa

In [37]:
raw_documents[0:5]

[Document(metadata={'source': 'data/aurora_borealis_tour.pdf', 'doc_id': 0}, page_content="Fwd: Booking RM6K4FLS confirmed | Ticket instructions\n1 message\nFrom: GetYourGuide\n To: NadineTue, Sep 12, 2023 at 4:54 PM\nThanks for your order,\nNadine\nFor easy access to your ticket and to be able to\nmanage your booking on the go, download our\napp.\nNovember 30, 2023 at 6:30 PM\nFrom Tromsø: Aurora\nBorealis Tour\n4 Adults (Age 0 - 99) • English • 6 hours\n€ 555\nSee activity details\nGet the GetYourGuide app to access your\nactivity\nAccess your activity with the GetYourGuide app. Just\nopen your tickets in the app and you’ll be ready to go.What to do on the day\nWhere to go Fredrik Langes gate 2, 9008 Tromsø,\nNorway Meet your guide outside the Scandic Ishavshotel.\nWhen to arrive\n6:20 PM - Arrive at the meeting point 10 minutes before \nyour chosen time so you don't lose your time slot.\nWhere your activity ends\nYour activity will end at the same place it began.\nImportant informat

## Create vectorestore - ChromaDB

In [38]:
# use OpenAI Embeddings
embeddings = "text-embedding-3-small"

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=OpenAIEmbeddings(model=embeddings)
)

## MultiVectoreRetriever
- this will embed the small chunks but retrieve the information from the parent document

In [39]:
# The storage layer for the parent documents
store = InMemoryByteStore()

# key to match child document(chunks) to parent document (raw)
id_key = "doc_id"

# create MultiVectorRetriever
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

doc_ids = [doc.metadata['doc_id'] for doc in raw_documents]
retriever.vectorstore.add_documents(chunked_documents)
retriever.docstore.mset(list(zip(doc_ids, raw_documents)))

In [40]:
# cosine similarity
retriever.vectorstore.similarity_search('Booking Details for Aurora Borealis Tour')

[Document(metadata={'chunk_id': 0, 'doc_id': 0, 'location': 'Tromsø, Norway', 'source': 'data/aurora_borealis_tour.pdf', 'title': 'Booking Confirmation and Ticket Instructions', 'type': 'activity'}, page_content='Booking Confirmation and Ticket Instructions:Thanks for your order, Nadine. For easy access to your ticket and to be able to manage your booking on the go, download our app. November 30, 2023, at 6:30 PM From Tromsø: Aurora Borealis Tour 4 Adults (Age 0 - 99) • English • 6 hours € 555. See activity details. Get the GetYourGuide app to access your activity. Access your activity with the GetYourGuide app. Just open your tickets in the app and you’ll be ready to go.'),
 Document(metadata={'chunk_id': 0, 'doc_id': 0, 'location': 'Tromsø, Norway', 'source': 'data/aurora_borealis_tour.pdf', 'title': 'Booking Confirmation and Ticket Instructions', 'type': 'activity'}, page_content='Booking Confirmation and Ticket Instructions:Thanks for your order, Nadine. For easy access to your tic

## Retrieval Evaluation

In [41]:
import pandas as pd
df_ground_truth = pd.read_csv('../data/ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [42]:
def compute_mrr(ranks):
    """
    Computes the Mean Reciprocal Rank (MRR) given a list of ranks.

    Args:
        ranks (list): List of ranks where each rank is the position of the relevant item.
                      A rank of 0 indicates the relevant item was not found.

    Returns:
        float: The Mean Reciprocal Rank.
    """
    reciprocal_ranks = [1 / rank if rank > 0 else 0 for rank in ranks]
    return sum(reciprocal_ranks) / len(reciprocal_ranks)


In [43]:
def compute_hits_and_ranks(relevant_item, retrieved_items):
    """
    Computes the hit and rank for a single query.

    Args:
        relevant_item: The relevant item (could be doc_id or (doc_id, chunk_id)).
        retrieved_items (list): List of retrieved items.

    Returns:
        tuple: (hit, rank)
            - hit (bool): True if the relevant item is in retrieved_items.
            - rank (int): Rank position of the relevant item (1-based). 0 if not found.
    """
    hit = relevant_item in retrieved_items
    try:
        rank = retrieved_items.index(relevant_item) + 1  # Ranks start from 1
    except ValueError:
        rank = 0  # Relevant item not found
    return hit, rank

In [44]:
def evaluate_query(ground_truth, retriever, k=5):
    """
    Evaluates the retriever's performance for a single ground truth item.

    Args:
        item (dict): A dictionary containing:
            - 'question': The query string.
            - 'doc_id': The relevant doc_id.
            - 'chunk_id': The relevant chunk_id.
        retriever: The retriever object.
        k (int): Number of top documents to consider.

    Returns:
        dict: Dictionary containing hits and ranks for both doc_id and doc_chunk.
    """
    query = ground_truth['question']
    relevant_doc_id = ground_truth['doc_id']
    relevant_chunk_id = ground_truth['chunk_id']
    relevant_doc_chunk = (relevant_doc_id, relevant_chunk_id)

    # Retrieve documents for the query
    results = retriever.vectorstore.similarity_search(query,k=k)

    # Extract retrieved doc_ids and chunks
    retrieved_doc_ids = [doc.metadata['doc_id'] for doc in results[:k]]
    retrieved_doc_chunks = [
        (doc.metadata['doc_id'], doc.metadata['chunk_id']) for doc in results[:k]
    ]

    # Compute hit and rank for doc_id
    hit_doc_id, rank_doc_id = compute_hits_and_ranks(relevant_doc_id, retrieved_doc_ids)

    # Compute hit and rank for doc_chunk
    hit_doc_chunk, rank_doc_chunk = compute_hits_and_ranks(relevant_doc_chunk, retrieved_doc_chunks)

    return {
        'hit_doc_id': hit_doc_id,
        'rank_doc_id': rank_doc_id,
        'hit_doc_chunk': hit_doc_chunk,
        'rank_doc_chunk': rank_doc_chunk
    }

In [45]:
from tqdm import tqdm

# Initialize lists to collect results
relevance_total_doc_id = []
relevance_total_doc_chunk = []
ranks_doc_id = []
ranks_doc_chunk = []

# Loop over all ground truth items
for item in tqdm(ground_truth):
    metrics = evaluate_query(item, retriever, k=5)
    relevance_total_doc_id.append(metrics['hit_doc_id'])
    ranks_doc_id.append(metrics['rank_doc_id'])
    relevance_total_doc_chunk.append(metrics['hit_doc_chunk'])
    ranks_doc_chunk.append(metrics['rank_doc_chunk'])

  0%|          | 0/235 [00:00<?, ?it/s]

 71%|███████   | 167/235 [00:51<00:18,  3.70it/s]

In [47]:
# Compute overall metrics
hit_rate_doc_id = sum(relevance_total_doc_id) / len(relevance_total_doc_id) if relevance_total_doc_id else 0
hit_rate_doc_chunk = sum(relevance_total_doc_chunk) / len(relevance_total_doc_chunk) if relevance_total_doc_chunk else 0
mrr_doc_id = compute_mrr(ranks_doc_id)
mrr_doc_chunk = compute_mrr(ranks_doc_chunk)

# Print the results
print(f"Hit Rate@{5} (doc_id only): {hit_rate_doc_id}")
print(f"Hit Rate@{5} (doc_id and chunk_id): {hit_rate_doc_chunk}")
print(f"MRR (doc_id only): {mrr_doc_id}")
print(f"MRR (doc_id and chunk_id): {mrr_doc_chunk}")

Hit Rate@5 (doc_id only): 0.9914893617021276
Hit Rate@5 (doc_id and chunk_id): 0.9446808510638298
MRR (doc_id only): 0.8753191489361701
MRR (doc_id and chunk_id): 0.7936879432624115


## RAG Flow

In [21]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
import os
load_dotenv()
GROQ_API_KEY = os.getenv('GROQ_API_KEY')

def get_llm(llm_type:str):
    if llm_type == 'groq':
        return ChatGroq(groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")
    elif llm_type == 'openai':
        return ChatOpenAI(model='gpt-4o-mini')

def rag(input:str, retriever):
    # Define the system prompt
    system_prompt = """
    You are a travel assistant that helps people extract relevant information from documents 
    and assist in the travel questions. Based on the provided context, please answer the following question.

    Context: {context}

    Question: {input}

    Answer:
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
    )


    # create chain to pass the douments to the prompt
    chain = create_stuff_documents_chain(get_llm('groq'), prompt)

    # chain to retrieve the documents 
    rag_chain = create_retrieval_chain(retriever, chain)
    result = rag_chain.invoke({"input":input})
    return result['answer']


In [20]:
rag("What is the date and time of my flight from Tromso?",retriever)

'According to the provided context, your flight from Tromso is as follows:\n\n* Flight number: SK 1234\n* Date: November 30, 2023\n* Departure time: 10:50\n* Arrival time: 12:50'

## RAG Evaluation

> This evaluation is something that I still need to improve. <br/>
The high relevance can be due to the fact that all the files are related to bookings. Hence, the answers could be relevant. <br/>
I would need to evaluate the precision and compare it to the actual document

In [15]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
llm_rag_eval = ChatOpenAI(model='gpt-4o-mini')

# create PromptTemplate
prompt = PromptTemplate(input_variables=['question','answer_llm'],template=rag_eval_prompt_template)

    

In [8]:
prompt

PromptTemplate(input_variables=['answer_llm', 'question'], template='You are an expert evaluator for a RAG system.\n    Your task is to analyze the relevance of the generated answer to the given question.\n    Based on the relevance of the generated answer, you will classify it\n    as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".\n    \n    Here is the data for evaluation:\n    Question: {question}\n    Generated Answer: {answer_llm}\n    \n    Please analyze the content and context of the generated answer in relation to the question\n    and provide your evaluation in parsable JSON without using code blocks:\n    {{\n      "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",\n      "Explanation": "[Provide a brief explanation for your evaluation]"\n    }}')

In [17]:
from langchain_core.output_parsers import StrOutputParser
def eval_rag(question, answer_llm,llm) -> list:
    # Define the prompt template
    rag_eval_prompt_template = """
    You are an expert evaluator for a RAG system.
    Your task is to analyze the relevance of the generated answer to the given question.
    Based on the relevance of the generated answer, you will classify it
    as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".
    
    Here is the data for evaluation:
    Question: {question}
    Generated Answer: {answer_llm}
    
    Please analyze the content and context of the generated answer in relation to the question
    and provide your evaluation in parsable JSON without using code blocks:
    {{
      "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
      "Explanation": "[Provide a brief explanation for your evaluation]"
    }}""".strip()

    # Create the prompt template with LangChain
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", rag_eval_prompt_template),
        ]
    )
    parser = StrOutputParser()

    chain = prompt | llm | parser
    # Create the LLM chain

    # Execute the chain with the provided context
    result = chain.invoke({"question": question, "answer_llm":answer_llm})
    
    return result

In [11]:
ground_truth[0]['question']
answer_llm = rag(ground_truth[0]['question'],retriever)

In [26]:
from tqdm import tqdm
evaluations = []

for record in tqdm(ground_truth):
    question = record['question']
    answer_llm = rag(question, retriever) 

    evaluation = eval_rag(question, answer_llm, get_llm('openai'))
    evaluation = json.loads(evaluation)

    evaluations.append((record, answer_llm, evaluation))

100%|██████████| 235/235 [14:31<00:00,  3.71s/it]


In [27]:
evaluations

[({'doc_id': 0,
   'chunk_id': 0,
   'question': 'What is the date and time of the Aurora Borealis Tour?'},
  'According to the provided context, the date and time of the Aurora Borealis Tour are:\n\n* Date: November 30, 2023\n* Time: 6:30 PM\n\nThese details can be found in the initial message from GetYourGuide, under the section "What to do on the day".',
  {'Relevance': 'RELEVANT',
   'Explanation': 'The generated answer directly addresses the question by providing the specific date and time of the Aurora Borealis Tour, which is the information being requested.'}),
 ({'doc_id': 0,
   'chunk_id': 0,
   'question': 'How many adults are included in the booking?'},
  'Based on the provided context, the booking includes a maximum of 2 adults in each of the rooms mentioned:\n\n1. Comfort Hotel Xpress Tromsø: Standard Double Room - 2 guests\n2. Brix Hostel: Standard Double Room with Shared Bathroom - 2 guests\n\nTherefore, the total number of adults included in the booking is 2 + 2 = 4 adu

In [31]:
df_eval = pd.DataFrame(evaluations, columns=['record', 'answer', 'evaluation'])
# df_eval['id'] = df_eval.record.apply(lambda d: d['id'])
df_eval['question'] = df_eval.record.apply(lambda d: d['question'])

df_eval['relevance'] = df_eval.evaluation.apply(lambda d: d['Relevance'])
df_eval['explanation'] = df_eval.evaluation.apply(lambda d: d['Explanation'])

In [32]:
df_eval.head()

Unnamed: 0,record,answer,evaluation,question,relevance,explanation
0,"{'doc_id': 0, 'chunk_id': 0, 'question': 'What...","According to the provided context, the date an...","{'Relevance': 'RELEVANT', 'Explanation': 'The ...",What is the date and time of the Aurora Boreal...,RELEVANT,The generated answer directly addresses the qu...
1,"{'doc_id': 0, 'chunk_id': 0, 'question': 'How ...","Based on the provided context, the booking inc...","{'Relevance': 'RELEVANT', 'Explanation': 'The ...",How many adults are included in the booking?,RELEVANT,The generated answer directly addresses the qu...
2,"{'doc_id': 0, 'chunk_id': 0, 'question': 'What...","According to the provided context, the total p...","{'Relevance': 'RELEVANT', 'Explanation': 'The ...",What is the total price for the Aurora Boreali...,RELEVANT,The generated answer directly addresses the qu...
3,"{'doc_id': 0, 'chunk_id': 0, 'question': 'In w...","According to the provided context, the Aurora ...","{'Relevance': 'RELEVANT', 'Explanation': 'The ...",In which language will the Aurora Borealis Tou...,RELEVANT,The generated answer directly addresses the qu...
4,"{'doc_id': 0, 'chunk_id': 0, 'question': 'What...","According to the provided context, the duratio...","{'Relevance': 'RELEVANT', 'Explanation': 'The ...",What is the duration of the Aurora Borealis Tour?,RELEVANT,The generated answer directly addresses the qu...


In [33]:
df_eval.relevance.value_counts(normalize=True)

relevance
RELEVANT           0.965957
PARTLY_RELEVANT    0.029787
NON_RELEVANT       0.004255
Name: proportion, dtype: float64