In [2]:
import dotenv
import os
# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

import nest_asyncio
# Patch asyncio so nested event loops are allowed; presumably needed because the
# notebook kernel already runs a loop and later cells use `await` — TODO confirm.
nest_asyncio.apply()

# None when the variable is not set. NOTE(review): this name is not referenced
# again in the visible cells; the LLM client presumably reads the key from the
# environment itself — confirm.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.google_genai import GoogleGenAI
import google.genai.types as types

# Generation settings handed to the Gemini client below: a thinking budget of 0,
# at most 512 output tokens per call, and a sampling temperature of 1.
config = types.GenerateContentConfig(
    thinking_config=types.ThinkingConfig(thinking_budget=0),
    max_output_tokens=512,
    temperature=1,
)

# LLM used in this notebook for answering, question generation and evaluation.
llm = GoogleGenAI(
    model="gemini-2.5-flash",
    generation_config=config,
    )

# Register the embedding model and the LLM as the global LlamaIndex defaults,
# so later cells can refer to them through `Settings`.
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)
Settings.llm = llm

  from .autonotebook import tqdm as notebook_tqdm


# Create a Persistent Vector Store on Disk

The primary difference is where the data is stored and how long it lasts.
1. PersistentClient:

Storage Location: Hard Drive (Disk)

2. EphemeralClient:

Storage Location: RAM (Random Access Memory).

In [4]:
import chromadb

# PersistentClient stores the collection on disk under `path`, so the data
# survives kernel restarts instead of living only in RAM.
chomra_client = chromadb.PersistentClient(path="./mini-llama-articles")
# Open the "mini-llama-articles" collection, creating it on the first run.
# get_or_create_collection replaces the create / bare-except / get pattern, whose
# `except:` also swallowed unrelated failures (bad path, KeyboardInterrupt, ...).
chroma_collection = chomra_client.get_or_create_collection("mini-llama-articles")

In [5]:
from llama_index.vector_stores.chroma import ChromaVectorStore

# Define a storage context object using the created vector database.
# The ChromaVectorStore is a wrapper that translates LlamaIndex commands into commands that ChromaDB understands.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [6]:
vector_store

ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})

# Loading the data

In [43]:
import csv

rows = []

# Load the CSV file. newline="" lets the csv module handle line endings itself,
# which the csv documentation requires so that quoted fields containing
# embedded newlines (likely in long article bodies) are parsed correctly.
with open("./data/mini-dataset.csv", mode="r", encoding="utf-8", newline="") as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # Skip the header row (no error if the file is empty).
    rows.extend(csv_reader)

# Each remaining row is one article.
print("number of articles:", len(rows))

number of articles: 14


In [44]:
rows[0]

["Beyond GPT-4: What's New?",
 'LLM Variants and Meta\'s Open Source Before shedding light on four major trends, I\'d share the latest Meta\'s Llama 2 and Code Llama. Meta\'s Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2\'s superior performance over most extant open-source chat models. Human-centric evaluations, focusing on safety and utility metrics, positioned Llama 2-Chat as a potential contender against proprietary, closed-source counterparts. The development trajectory of Llama 2 emphasized rigorous fine-tuning methodologies. Meta\'s transparent delineation of these processes aims to catalyze community-driven advancements in LLMs, underscoring a commitment to collaborative and responsible AI development. Code Llama is built on top of L

# Convert to Document obj


Metadata is helpful because it allows you to filter and organize your data before or after searching it. Without metadata, your RAG system is just searching through a giant, messy pile of text segments based only on "meaning."


In technical terms, metadata turns your "unstructured" text into "semi-structured" data, which gives you control over the retrieval process.

* Vector search (semantic search) is probabilistic—it guesses what is relevant. Metadata allows you to apply strict rules that the AI must follow.
* Sometimes "meaning" isn't enough; you need specific details. Metadata allows you to combine keyword/category search with semantic search.

In [75]:
from llama_index.core import Document

# Wrap every CSV row in a Document: column 1 is the article text, columns 0, 2
# and 3 become metadata, and the URL (column 2) doubles as the document id.
documents = [
    Document(
        id_=record[2],
        text=record[1],
        metadata=dict(title=record[0], url=record[2], source_name=record[3]),
    )
    for record in rows
]

In [76]:
documents[0]

Document(id_='https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', embedding=None, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='LLM Variants and Meta\'s Open Source Before shedding light on four major trends, I\'d share the latest Meta\'s Llama 2 and Code Llama. Meta\'s Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2\'s superior performance over most extant open-source chat models. Human-centric evaluations, focusing 

## TokenTextSplitter (The "Mathematical" Splitter)
* This splitter focuses strictly on the number of tokens. It does not care about grammar or sentence structure.
* It counts tokens until it reaches your chunk_size (e.g., 512). Once it hits that limit, it cuts the text.

## SentenceSplitter (The "Grammatical" Splitter)
* This splitter focuses on linguistic completeness. It is designed to keep sentences and paragraphs whole.
* It also respects your chunk_size limit, but it calculates the split differently. It looks for sentence terminators (like ., ?, !) and paragraph breaks (\n\n). It will cut the chunk early (e.g., at 480 tokens instead of 512) just to ensure the last sentence is complete

## What is BaseNode in LlamaIndex

BaseNode is the fundamental building block (or "atom") of data in LlamaIndex.

While a Document represents your entire file (like a whole PDF or text file), a BaseNode represents a specific chunk of that file that the system will actually process, embed, and search for.

Think of BaseNode as a smart container. It is the parent class for specific types of nodes (like TextNode or ImageNode). It holds four critical pieces of information
1. The Content:
2. The Metadata
3. The Embedding:
4. The Relationships

# Transforming

In [67]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import BaseNode
import hashlib


def deterministic_id_func(i: int, doc: BaseNode) -> str:
    """Build a repeatable node id from the parent id and the chunk position.

    Hashing `doc.id_` concatenated with `i` means that splitting the same
    document again reproduces exactly the same ids.

    Args:
        i: Index supplied by the text splitter for the chunk being created.
        doc: Node the chunk is cut from; only its `id_` is read.

    Returns:
        Hex-encoded SHA-256 digest of the UTF-8 encoded `doc.id_ + str(i)`.
    """
    return hashlib.sha256((doc.id_ + str(i)).encode("utf-8")).hexdigest()


# Token-based splitter: chunk_size=512 and chunk_overlap=128 are token counts,
# splitting happens on spaces, and each chunk's node id comes from
# deterministic_id_func so repeated runs produce the same ids.
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128, id_func=deterministic_id_func
)

In [68]:
text_splitter

TokenTextSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x00000244BE753350>, id_func=<function deterministic_id_func at 0x00000244CD9BFBA0>, chunk_size=512, chunk_overlap=128, separator=' ', backup_separators=['\n'], keep_whitespaces=False)

When you run pipeline.run(...) (in the cell below), two things happen:
1. Processed: It cuts the text into chunks and, if an embedding model is listed in `transformations`, calculates the vectors.
2. Saved: Because you included vector_store=vector_store in the pipeline setup, nodes that carry an embedding are automatically saved into your ChromaDB database on the hard drive.

In [77]:
documents

[Document(id_='https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', embedding=None, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='LLM Variants and Meta\'s Open Source Before shedding light on four major trends, I\'d share the latest Meta\'s Llama 2 and Code Llama. Meta\'s Llama 2 represents a sophisticated evolution in LLMs. This suite spans models pretrained and fine-tuned across a parameter spectrum of 7 billion to 70 billion. A specialized derivative, Llama 2-Chat, has been engineered explicitly for dialogue-centric applications. Benchmarking revealed Llama 2\'s superior performance over most extant open-source chat models. Human-centric evaluations, focusing

In [78]:
from llama_index.core.ingestion import IngestionPipeline

# The documents are always split into chunks. The embedding step runs only when
# the Chroma collection is still empty: the pipeline writes a node to the vector
# store only if it carries an embedding, so on a fresh database the embedding
# model must be part of the pipeline, otherwise nothing is stored and every
# later query comes back empty. On later runs the step is skipped so chunks that
# are already on disk are not embedded and inserted a second time.
transformations = [text_splitter]
if chroma_collection.count() == 0:
    transformations.append(Settings.embed_model)

pipeline = IngestionPipeline(
    transformations=transformations,
    vector_store=vector_store,
)

nodes = pipeline.run(documents=documents, show_progress=True)

Parsing nodes: 100%|██████████| 14/14 [00:00<00:00, 38.60it/s]


In [79]:
nodes[0]

TextNode(id_='bed326d2aa2a2a61e9c45f53c61da03385f3193c27387c90ea11db6ea38d9c13', embedding=None, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, hash='892462426965dc17b5696971312d7c92c96d62822273fda587edbd562d24e074'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='90d6f2b7d18295edf2aaaaaf283b1ba5666039c2e544e99b6a25076d2aeaf255', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='71418de3d50e604c2581574f1abf2248e5cc3ab7c74a3182c37cb1152d0cfd21')}, metadata_template='{key}: {value

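The 'nodes' variable is a LIST of TextNode objects. Each TextNode is a "smart container" that holds one specific chunk of text, its metadata, and, when the embedding step is part of the pipeline, its vector embedding (`embedding=None` in the output above means the embedding step did not run for that execution).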

In [80]:
nodes[0].node_id

'bed326d2aa2a2a61e9c45f53c61da03385f3193c27387c90ea11db6ea38d9c13'

# Load Indexes
We need this because ChromaDB is just a storage bucket, but index is the search engine.

Right now, your data is sitting safely in the database, but your Python script doesn't have a "handle" or a "control panel" to talk to it yet.
* vector_store (The Database): Holds the files. It's passive. It just sits there.
* index (The Engine): This is the LlamaIndex tool that knows how to search that database. It gives you the methods you need, like .as_retriever() or .as_query_engine().

In [81]:
from llama_index.core import VectorStoreIndex

index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [15]:
query_engine = index.as_query_engine(llm=Settings.llm, similarity_top_k=5)

In [11]:
res = query_engine.query("How many parameters LLaMA 2 model has?")

In [12]:
res.response

'The Llama 2 model is available in four sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. However, the 34 billion parameter model has not yet been released.'

In [20]:
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    print("Text\t", src.text)
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 7d497cac3b5f312a37c14edad55286376d788cf3e5e0360d1b38fbbe73efea2e
Title	 Meta's Llama 2: Revolutionizing Open Source Language Models for Commercial Use
Text	 I. Llama 2: Revolutionizing Commercial Use Unlike its predecessor Llama 1, which was limited to research use, Llama 2 represents a major advancement as an open-source commercial model. Businesses can now integrate Llama 2 into products to create AI-powered applications. Availability on Azure and AWS facilitates fine-tuning and adoption. However, restrictions apply to prevent exploitation. Companies with over 700 million active daily users cannot use Llama 2. Additionally, its output cannot be used to improve other language models.  II. Llama 2 Model Flavors Llama 2 is available in four different model sizes: 7 billion, 13 billion, 34 billion, and 70 billion parameters. While 7B, 13B, and 70B have already been released, the 34B model is still awaited. The pretrained variant, trained on a whopping 2 trillion tokens, boasts a

# Evaluate the retrieval process and quality of answers

We can evaluate our RAG system with a dataset of questions and associated chunks. Given a question, we can see if the RAG system retrieves the correct chunks of text that can answer the question.

You can generate a synthetic dataset with an LLM such as `gemini-2.5-flash` or create an authentic and manually curated dataset.

Note that a **well curated dataset will always be a better option**, especially for a specific domain or use case.

In our example, we will generate a synthetic dataset using `gemini-2.5-flash` (the model configured in `Settings.llm`) to keep it simple.

This is the default prompt that the `generate_question_context_pairs` function uses:

```python
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and no prior knowledge,
generate only questions based on the below query.

You are a Teacher/Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""
```


### 1\. What is `MetadataMode`?

Think of **`MetadataMode`** as a **Visibility Switch** and the **Customizing Attributes** as a **Formatting Style**. They work together to decide exactly what text gets fed into the AI models.


This is a setting that determines **who gets to see the metadata**. You often want the Embedding model to see *different* information than the LLM.

  * **`MetadataMode.EMBED`**: Controls what the **Retrieval Model** sees.
      * *Goal:* You usually show **more** metadata here (like filenames, keywords) so the system can find the document easily.
  * **`MetadataMode.LLM`**: Controls what the **Generation Model** (like GPT-4 or Gemini) sees.
      * *Goal:* You usually show **less** metadata here. You hide distracting things (like internal IDs or filenames) so the LLM focuses only on the content to write a good answer.
  * **`MetadataMode.NONE`**: Shows **only** the text content (no metadata at all).
  * **`MetadataMode.ALL`**: Shows **everything**.

-----
Very Good Example is given here: https://developers.llamaindex.ai/python/framework/module_guides/loading/documents_and_nodes/usage_documents/#:~:text=Knowing%20all%20this%2C%20let%E2%80%99s%20create%20a%20short%20example%20using%20all%20this%20power%3A


## EmbeddingQAFinetuneDataset

This class is a **container** designed to hold your "Test Exam" data in a structured format.

Think of `EmbeddingQAFinetuneDataset` as the digital binder that holds the "Answer Key" for your RAG system. It doesn't *do* any math or processing itself; it simply organizes the questions and answers so other tools (like the Evaluator or Fine-Tuner) can read them.

### What it contains

Inside this object, there are three specific dictionaries that link everything together:

1.  **`queries`**: A dictionary mapping each question ID to the question text.
      * *Example:* `{"q1": "What is Llama 2?", "q2": "Who created it?"}`
2.  **`corpus`**: A dictionary mapping each node ID to the text of that chunk (Node).
      * *Example:* `{"node_A": "Llama 2 is a model...", "node_B": "Meta released it..."}`
3.  **`relevant_docs`**: The "Answer Key" linking them.
      * *Example:* `{"q1": ["node_A"], "q2": ["node_B"]}`

### Why you need it

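The cell below builds one of these objects by calling:

```python
rag_eval_dataset = generate_question_context_pairs(...)
```

The retriever evaluation later in this notebook reads the questions and the "Answer Key" from this object to check whether the expected chunk is retrieved for each question.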

In [None]:
from llama_index.core.llms.utils import LLM
from llama_index.core.schema import MetadataMode, TextNode
from tqdm import tqdm
import json
import re
import uuid
import warnings
import time
from typing import Dict, List, Tuple
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Prompt sent to the LLM for every chunk. The `{context_str}` and
# `{num_questions_per_chunk}` placeholders are filled in with str.format()
# inside generate_question_context_pairs.
DEFAULT_QA_GENERATE_PROMPT_TMPL = """\
Context information is below.

---------------------
{context_str}
---------------------

Given the context information and not prior knowledge.
generate only questions based on the below query.

You are a Teacher/ Professor. Your task is to setup \
{num_questions_per_chunk} questions for an upcoming \
quiz/examination. The questions should be diverse in nature \
across the document. Restrict the questions to the \
context information provided."
"""

# Matches a leading list number such as "1.", "2)" or "3 " at the start of a line.
_LEADING_NUMBER = re.compile(r"^\d+[\).\s]")


def _extract_questions(raw_response: str, limit: int) -> List[str]:
    """Split raw LLM output into at most `limit` non-empty, un-numbered questions."""
    stripped_lines = (
        _LEADING_NUMBER.sub("", line).strip()
        for line in raw_response.strip().split("\n")
    )
    return [line for line in stripped_lines if line][:limit]


def generate_question_context_pairs(
    nodes: List[TextNode],
    llm: LLM,
    qa_generate_prompt_tmpl: str = DEFAULT_QA_GENERATE_PROMPT_TMPL,
    num_questions_per_chunk: int = 2,
    request_delay: float = 2.0
) -> EmbeddingQAFinetuneDataset:
    """Generate a synthetic retrieval-evaluation dataset from text nodes.

    For every node the LLM is asked to write questions about that node's text;
    each generated question is then linked to the node it came from.

    Args:
        nodes: Chunks to generate questions for.
        llm: Model used to write the questions (one `complete` call per node).
        qa_generate_prompt_tmpl: Prompt template with `{context_str}` and
            `{num_questions_per_chunk}` placeholders.
        num_questions_per_chunk: Maximum number of questions kept per node.
        request_delay: Seconds to wait between two consecutive LLM requests.
            Values <= 0 disable the pause.

    Returns:
        EmbeddingQAFinetuneDataset whose `queries` maps a random UUID to each
        question, `corpus` maps node ids to node text, and `relevant_docs`
        maps each question id to a one-element list with its source node id.

    Warns:
        UserWarning: when the LLM produced fewer questions than requested.
    """
    # Only the plain text (no metadata) is sent to the LLM and stored in the corpus.
    node_dict = {
        node.node_id: node.get_content(metadata_mode=MetadataMode.NONE)
        for node in nodes
    }

    queries = {}
    relevant_docs = {}
    last_position = len(node_dict)

    for position, (node_id, text) in enumerate(tqdm(node_dict.items()), start=1):
        query = qa_generate_prompt_tmpl.format(
            context_str=text, num_questions_per_chunk=num_questions_per_chunk
        )
        response = llm.complete(query)
        questions = _extract_questions(str(response), num_questions_per_chunk)

        num_questions_generated = len(questions)
        if num_questions_generated < num_questions_per_chunk:
            warnings.warn(
                f"Fewer questions generated ({num_questions_generated}) "
                f"than requested ({num_questions_per_chunk})."
            )

        for question in questions:
            question_id = str(uuid.uuid4())
            queries[question_id] = question
            relevant_docs[question_id] = [node_id]

        # Throttle between requests only; pausing after the final request would
        # just delay the return without protecting any further call.
        if request_delay > 0 and position < last_position:
            time.sleep(request_delay)

    # queries contains the generated questions
    # node_dict contains the text content with node ids
    # relevant_docs maps question ids to the corresponding node ids
    return EmbeddingQAFinetuneDataset(
        queries=queries, corpus=node_dict, relevant_docs=relevant_docs
    )


import os

# Generating the dataset costs one LLM call per chunk (several minutes in total)
# and the question ids are random UUIDs, so the result is cached on disk and
# reused on later runs. Delete the JSON file to force a regeneration, e.g. after
# changing the documents or the chunking settings.
RAG_EVAL_DATASET_PATH = "./rag_eval_dataset.json"

if os.path.exists(RAG_EVAL_DATASET_PATH):
    rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json(RAG_EVAL_DATASET_PATH)
else:
    rag_eval_dataset = generate_question_context_pairs(
        nodes,
        llm=Settings.llm,
        num_questions_per_chunk=1,  # 1 question per chunk; raise to 2-3 for better coverage
        request_delay=2
    )

    # Save the dataset as a json file for later use
    rag_eval_dataset.save_json(RAG_EVAL_DATASET_PATH)


100%|██████████| 108/108 [04:52<00:00,  2.71s/it]


In [83]:
nodes[0]

TextNode(id_='bed326d2aa2a2a61e9c45f53c61da03385f3193c27387c90ea11db6ea38d9c13', embedding=None, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': "Beyond GPT-4: What's New?", 'url': 'https://pub.towardsai.net/beyond-gpt-4-whats-new-cbd61a448eb9#dda8', 'source_name': 'towards_ai'}, hash='892462426965dc17b5696971312d7c92c96d62822273fda587edbd562d24e074'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='90d6f2b7d18295edf2aaaaaf283b1ba5666039c2e544e99b6a25076d2aeaf255', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='71418de3d50e604c2581574f1abf2248e5cc3ab7c74a3182c37cb1152d0cfd21')}, metadata_template='{key}: {value

### Evaluation for Hit Rate and Mean Reciprocal Rank (MRR)

We will make use of `RetrieverEvaluator` available in Llama-index. We will measure the Hit Rate and Mean Reciprocal Rank (MRR).

**Hit Rate:**

Think of the Hit Rate like playing a game of guessing. You're given a question and you need to guess the correct answer from a list of options. The Hit Rate measures how often you guess the correct answer by only looking at your top few guesses. If you often find the right answer in your first few guesses, you have a high Hit Rate. So, in the context of a retrieval system, it's about how frequently the system finds the correct document within its top 'k' picks (where 'k' is a number you decide, like top 5 or top 10).

**Mean Reciprocal Rank (MRR):**

MRR is a bit like measuring how quickly you can find a treasure in a list of boxes. Imagine you have a row of boxes and only one of them has a treasure. The MRR calculates how close to the start of the row the treasure box is, on average. If the treasure is always in the first box you open, you're doing great and have an MRR of 1. If it's in the second box, the score is 1/2, since you took two tries to find it. If it's in the third box, your score is 1/3, and so on. MRR averages these scores across all your searches. So, for a retrieval system, MRR looks at where the correct document ranks in the system's guesses. If it's usually near the top, the MRR will be high, indicating good performance.
In summary, Hit Rate tells you how often the system gets it right in its top guesses, and MRR tells you how close to the top the right answer usually is. Both metrics are useful for evaluating the effectiveness of a retrieval system, like how well a search engine or a recommendation system works.


In [84]:
# We can also load the dataset from a previously saved json file.
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset.json")

In [86]:
rag_eval_dataset

EmbeddingQAFinetuneDataset(queries={'713e1107-d354-427d-8840-191719ed0d83': 'Describe the relationship between Llama 2 and Code Llama, and explain the purpose of the three Code Llama variants.', 'd4a90649-8d00-477c-a314-be6446306ef2': 'What is the primary difference in capability between an LLM like ChatGPT (GPT-3.5) and a multimodal model like GPT-4, and how does this difference impact their potential applications?', '3da3ccd3-e6fb-4d34-b033-c5436cab9dd4': '**Compare and contrast** the primary function of LLM connections like LlamaIndex with the purpose of Vector DBs. In your answer, provide specific examples of tools or applications for each.', '7030c866-5cff-4e4c-a5b0-95786523f3a2': 'What are the two primary ways to fine-tune LLMs, and what three technical methods are involved in this process?', 'ea5cf2b5-efe1-45b4-92b9-9ea93e27a2ff': 'What is the primary shift occurring in the approach to LLMs, moving from static optimization to dynamic adaptability, and what technology is replacin

In [87]:
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x244be79cad0>

Comparing the expected node ID with the retrieved node ID.

In [88]:
# Take the first generated question from the evaluation dataset.
first_query_id = list(rag_eval_dataset.queries.keys())[0]
# relevant_docs maps each question id to a list of node ids; the first entry is
# the chunk the question was generated from, i.e. the one retrieval should find.
expected_node_id = rag_eval_dataset.relevant_docs[first_query_id][0] 
print(f"Expected Node ID (from Dataset): {expected_node_id}")

Expected Node ID (from Dataset): bed326d2aa2a2a61e9c45f53c61da03385f3193c27387c90ea11db6ea38d9c13


In [89]:
query_text = rag_eval_dataset.queries[first_query_id]
# Ask the retriever for the single best match (similarity_top_k=1). Results come
# back wrapped (NodeWithScore in LlamaIndex); `.node` unwraps the stored node.
retrieved_nodes = index.as_retriever(similarity_top_k=1).retrieve(query_text)
actual_node_id = retrieved_nodes[0].node.node_id
print(f"Actual Node ID (from Database):  {actual_node_id}")

Actual Node ID (from Database):  bed326d2aa2a2a61e9c45f53c61da03385f3193c27387c90ea11db6ea38d9c13


In [23]:
query_text

'Based on the provided text, describe the relationship between Llama 2 and Code Llama, highlighting at least two specific ways Code Llama extends the capabilities of Llama 2.'

Create a helper that summarizes the retriever evaluation results, then loop over several `top_k` values to collect them.

In [91]:
import pandas as pd


def display_results_retriever(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: Label written to the "Retriever Name" column.
        eval_results: Iterable of result objects exposing `metric_vals_dict`
            with "hit_rate" and "mrr" entries.

    Returns:
        DataFrame with the columns "Retriever Name", "Hit Rate" and "MRR",
        where both metrics are the means over all results.
    """
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])
    summary = {
        "Retriever Name": [name],
        "Hit Rate": [per_query["hit_rate"].mean()],
        "MRR": [per_query["mrr"].mean()],
    }
    return pd.DataFrame(summary)

**Why Async?**
Look at the parameter workers=32. This tells the code to run 32 evaluations at the same time. You cannot do this efficiently in standard synchronous Python. You need await to manage these 32 parallel tasks without freezing your computer.

In [92]:
from llama_index.core.evaluation import RetrieverEvaluator

# We can evaluate the retievers with different top_k values.
for i in [2, 4, 6, 8, 10]:
    retriever = index.as_retriever(similarity_top_k=i)
    print("Retraiver: ", retriever)
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        ["mrr", "hit_rate"], retriever=retriever
    )

    eval_results = await retriever_evaluator.aevaluate_dataset(
        rag_eval_dataset, workers=32
    )
    print(display_results_retriever(f"Retriever top_{i}", eval_results))

time.sleep(60)

Retraiver:  <llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x00000244BE728B30>
    Retriever Name  Hit Rate       MRR
0  Retriever top_2  0.268519  0.134259
Retraiver:  <llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x00000244BC225430>
    Retriever Name  Hit Rate       MRR
0  Retriever top_4  0.611111  0.152778
Retraiver:  <llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x00000244BE7522A0>
    Retriever Name  Hit Rate      MRR
0  Retriever top_6  0.703704  0.16821
Retraiver:  <llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x00000244BE125EE0>
    Retriever Name  Hit Rate       MRR
0  Retriever top_8  0.787037  0.174769
Retraiver:  <llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever object at 0x00000244BE1EA5D0>
     Retriever Name  Hit Rate       MRR
0  Retriever top_10  0.814815  0.17

* The Embeddings are "Fuzzy": The low MRR (0.17) is a warning sign. Ideally, MRR should be closer to 0.6 or 0.8. An MRR of 0.17 implies that even when the system does find the right document, it usually ranks it around 5th or 6th rather than 1st. It needs a wide net: because the correct chunk is rarely ranked first (low MRR), the system needs a larger safety net (top_k=8 or 10) to ensure it catches the right answer.

* The "Sweet Spot":

Top_8 looks like the smartest choice here. It gives you nearly the same accuracy (78%) as Top_10 (81%) but saves you from processing 2 extra chunks of text per query (saving money/tokens).

We can improve the system by doing the following:
* Improve Embeddings: Switch from `BAAI/bge-small-en-v1.5` (the small embedding model used in this notebook) to a stronger embedding model.
* Re-Ranking: Add a "Re-Ranker" step. This allows you to fetch 10 documents (high Hit Rate) but then use a smarter model to sort the best one to the top (fixing the MRR). This is a standard advanced technique in RAG.

LlamaIndex also integrates with community evaluation tools.

* UpTrain: https://github.com/uptrain-ai/uptrain
* Tonic Validate(Includes Web UI for visualizing results): https://developers.llamaindex.ai/python/framework/community/integrations/tonicvalidate
* DeepEval: https://github.com/confident-ai/deepeval
* Ragas: https://github.com/explodinggradients/ragas/blob/main/docs/howtos/integrations/llamaindex.ipynb
* RAGChecker: https://github.com/amazon-science/RAGChecker
* Cleanlab: https://developers.llamaindex.ai/python/examples/evaluation/cleanlab

### Evaluation using Relevance and Faithfulness metrics.

Here, we evaluate the answer generated by the LLM. Is the answer using the correct context? Is the answer faithful to the context? Is the answer relevant to the question?

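An LLM will answer these questions. OpenAI models such as `gpt-5` are recommended as the judge, but this notebook uses the Gemini model configured above (`gemini-2.5-flash`) for demonstration.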

**`FaithfulnessEvaluator`**
Evaluates if the answer is faithful to the retrieved contexts (in other words, whether there's a hallucination).

**`RelevancyEvaluator`**
Evaluates whether the retrieved context and answer are relevant to the user question.

Now, let's see how the top_k value affects these two metrics.


In [94]:
from llama_index.core.evaluation import RelevancyEvaluator, FaithfulnessEvaluator, BatchEvalRunner
from llama_index.llms.openai import OpenAI

# Create your index
from llama_index.core import VectorStoreIndex
index = VectorStoreIndex.from_vector_store(vector_store)

# Recommened to use gpt models for evaluation, but here we use Google Gemini for demonstration
# llm_gpt5 = OpenAI(model="gpt-5", additional_kwargs={'reasoning_effort':'minimal'})
# llm_gpt5_mini = OpenAI(model="gpt-5-mini", additional_kwargs={'reasoning_effort':'minimal'})

# Initiate the faithfulnes and relevancy evaluator objects
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)
relevancy_evaluator = RelevancyEvaluator(llm=llm)

# Extract the questions from the dataset
queries = list(rag_eval_dataset.queries.values())
# Limit to first 10 question to save time (!!remove this line in production!!)
batch_eval_queries = queries[:20]

# The batch evaluator runs the evaluation in batches
runner = BatchEvalRunner(
    {"faithfulness": faithfulness_evaluator, "relevancy": relevancy_evaluator},
    workers=32,
)


# Define a for-loop to try different `similarity_top_k` values
for i in [2, 4, 6, 8, 10]:
    # Set query engine with different number of returned chunks
    query_engine = index.as_query_engine(similarity_top_k=i, llm = llm)

    # Run the evaluation
    eval_results = await runner.aevaluate_queries(query_engine, queries=batch_eval_queries)

    # Printing the results
    faithfulness_score = sum(
        result.passing for result in eval_results["faithfulness"]
    ) / len(eval_results["faithfulness"])
    print(f"top_{i} faithfulness_score: {faithfulness_score}")

    relevancy_score = sum(result.passing for result in eval_results["relevancy"]) / len(
        eval_results["relevancy"]
    )
    print(f"top_{i} relevancy_score: {relevancy_score}")
    print("="*15)


top_2 faithfulness_score: 1.0
top_2 relevancy_score: 0.9
top_4 faithfulness_score: 1.0
top_4 relevancy_score: 1.0
top_6 faithfulness_score: 1.0
top_6 relevancy_score: 1.0
top_8 faithfulness_score: 1.0
top_8 relevancy_score: 1.0
top_10 faithfulness_score: 1.0
top_10 relevancy_score: 1.0


### Correctness


In [95]:
from llama_index.core.evaluation import CorrectnessEvaluator

# Question that both the reference answer and the answer under test respond to.
query = "Can you explain the theory of relativity proposed by Albert Einstein in detail?"

# Reference answer that the generated answer is compared against.
reference = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

General relativity, published in 1915, extended these ideas to include the effects of gravity. According to general relativity, gravity is not a force between masses, as described by Newton's theory of gravity, but rather the result of the warping of space and time by mass and energy. Massive objects, such as planets and stars, cause a curvature in spacetime, and smaller objects follow curved paths in response to this curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet, causing it to create a depression that other objects (representing smaller masses) naturally move towards.

In essence, general relativity provided a new understanding of gravity, explaining phenomena like the bending of light by gravity (gravitational lensing) and the precession of the orbit of Mercury. It has been confirmed through numerous experiments and observations and has become a fundamental theory in modern physics.
"""

# Answer under test: its first paragraph matches the reference, but the second
# one attributes the curvature of spacetime to magnetism instead of gravity.
response = """
Certainly! Albert Einstein's theory of relativity consists of two main components: special relativity and general relativity. Special relativity, published in 1905, introduced the concept that the laws of physics are the same for all non-accelerating observers and that the speed of light in a vacuum is a constant, regardless of the motion of the source or observer. It also gave rise to the famous equation E=mc², which relates energy (E) and mass (m).

However, general relativity, published in 1915, extended these ideas to include the effects of magnetism. According to general relativity, gravity is not a force between masses but rather the result of the warping of space and time by magnetic fields generated by massive objects. Massive objects, such as planets and stars, create magnetic fields that cause a curvature in spacetime, and smaller objects follow curved paths in response to this magnetic curvature. This concept is often illustrated using the analogy of a heavy ball placed on a rubber sheet with magnets underneath, causing it to create a depression that other objects (representing smaller masses) naturally move towards due to magnetic attraction.
"""

In [105]:
# Judge `response` against `reference` for the given `query`, using the
# notebook's LLM as the grader.
evaluator = CorrectnessEvaluator(llm=llm)

# The returned result exposes `.score` and `.feedback`, shown in the next cells.
result = await evaluator.aevaluate(
    query=query,
    response=response,
    reference=reference
)

In [106]:
result.score

2.0

In [108]:
print(result.feedback)

The generated answer explains general relativity incorrectly by stating that magnetism is the cause of warping space and time. It also incorrectly states that the analogy of a heavy ball on a rubber sheet includes magnets underneath.
