In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%pwd

'd:\\Study\\VietAI\\CV_Chatbot\\notebook'

In [3]:
data = pd.read_csv('../data/llama_blog.csv')

In [4]:
data.head()

Unnamed: 0,source,title,url,date,content
0,/blog/introducing-llamaindex-0-11,Introducing LlamaIndex 0.11,https://www.llamaindex.ai/blog/introducing-lla...,"Aug 22, 2024",LlamaIndex is delighted to announce that we ha...
1,/blog/efficient-chunk-size-optimization-for-ra...,Efficient Chunk Size Optimization for RAG Pipe...,https://www.llamaindex.ai/blog/efficient-chunk...,"Aug 21, 2024",In Retrieval-Augmented Generation (RAG) system...
2,/blog/llamaindex-newsletter-2024-08-20,LlamaIndex Newsletter 2024-08-20,https://www.llamaindex.ai/blog/llamaindex-news...,"Aug 20, 2024","Hi there, Llama Lovers! 🦙\n\nWelcome to this w..."
3,/blog/llamaindex-newsletter-2024-08-13,LlamaIndex Newsletter 2024-08-13,https://www.llamaindex.ai/blog/llamaindex-news...,"Aug 13, 2024","Hi there, Llama Fans! 🦙\n\nWelcome to this wee..."
4,/blog/llamaindex-newsletter-2024-08-06,LlamaIndex Newsletter 2024-08-06,https://www.llamaindex.ai/blog/llamaindex-news...,"Aug 6, 2024","Greetings, Llama Lovers! 🦙\n\nWelcome to this ..."


In [7]:
import re 

# Clean the text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

data['title'] = data['title'].apply(clean_text)
data['content'] = data['content'].apply(clean_text)

In [11]:
from dotenv import load_dotenv

load_dotenv("../.env")

True

In [13]:
import os
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [15]:
from llama_index.core import Settings, Document, VectorStoreIndex, StorageContext
from llama_index.embeddings.openai import OpenAIEmbedding


# Initialize the OpenAI  embedding
Settings.embedding = OpenAIEmbedding(model = "text-embedding-3-small")

documents = [
    Document(
        text=row['content'],
        metadata={
            'source': row['source'],
            'title': row['title'],
            'url': row['url'],
            'date': row['date'],
        },
    )
    for index, row in data.iterrows()
]

In [16]:
from llama_index.core import Settings, Document, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceSplitter


chunk_sizes = [256, 512, 1024, 2048]
nodes_list = []
vector_indices = []
for chunk_size in chunk_sizes:
    print(f"Chunk Size: {chunk_size}")
    nodes = SentenceSplitter(chunk_size=chunk_size,
                                chunk_overlap=128,
                                paragraph_separator="\n\n",
                            ).get_nodes_from_documents(documents)

    # add chunk size to nodes to track later
    for node in nodes:
        node.metadata["chunk_size"] = chunk_size
        node.excluded_embed_metadata_keys = ["chunk_size"]
        node.excluded_llm_metadata_keys = ["chunk_size"]

    nodes_list.append(nodes)

    # build vector index
    vector_index = VectorStoreIndex(nodes)
    vector_indices.append(vector_index)

Chunk Size: 256
Chunk Size: 512
Chunk Size: 1024
Chunk Size: 2048


In [None]:
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(model = "gpt-4o")
Settings.llm

In [18]:
from llama_index.core.tools import RetrieverTool
from llama_index.core.schema import IndexNode

# retriever_tools = []
retriever_dict = {}
retriever_nodes = []
for chunk_size, vector_index in zip(chunk_sizes, vector_indices):
    node_id = f"chunk_{chunk_size}"
    node = IndexNode(
        text=(
            "Retrieves relevant context from the all Llama Blog posts. chunk_size: "
            f" {chunk_size})"
        ),
        index_id=node_id,
    )
    retriever_nodes.append(node)
    retriever_dict[node_id] = vector_index.as_retriever()

In [21]:
from llama_index.core.selectors import PydanticMultiSelector

from llama_index.core.retrievers import RouterRetriever
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core import SummaryIndex

# the derived retriever will just retrieve all nodes
summary_index = SummaryIndex(retriever_nodes)

retriever = RecursiveRetriever(
    root_id="root",
    retriever_dict={"root": summary_index.as_retriever(), **retriever_dict},
)

In [22]:
# define reranker
from llama_index.core.postprocessor import LLMRerank, SentenceTransformerRerank

reranker = LLMRerank()

In [26]:
from llama_index.core.query_engine import RetrieverQueryEngine

query_engine  = RetrieverQueryEngine(retriever, node_postprocessors=[reranker])


In [27]:
response = query_engine.query(
    "What are the two main metrics used to evaluate the performance of the different rerankers in the RAG system?"
)

In [None]:
from llama_index.core.response.notebook_utils import display_response
display_response(
    response, show_source=True, source_length=500, show_source_metadata=True
)

In [30]:
from collections import defaultdict
import pandas as pd


def mrr_all(metadata_values, metadata_key, source_nodes):
    # source nodes is a ranked list
    # go through each value, find out positioning in source_nodes
    value_to_mrr_dict = {}
    for metadata_value in metadata_values:
        mrr = 0
        for idx, source_node in enumerate(source_nodes):
            if source_node.node.metadata[metadata_key] == metadata_value:
                mrr = 1 / (idx + 1)
                break
            else:
                continue

        # normalize AP, set in dict
        value_to_mrr_dict[metadata_value] = mrr

    df = pd.DataFrame(value_to_mrr_dict, index=["MRR"])
    df.style.set_caption("Mean Reciprocal Rank")
    return df

In [32]:
print("Mean Reciprocal Rank for each Chunk Size")
mrr_all(chunk_sizes, "chunk_size", response.source_nodes)

Mean Reciprocal Rank for each Chunk Size


Unnamed: 0,256,512,1024,2048
MRR,1.0,0.333333,0.2,0.166667
