In [1]:
# pip install llama-index llama-index-embeddings-huggingface llama-index-llms-google-genai llama-index-llms-groq llama-index-vector-stores-chroma chromadb huggingface-hub

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.llms.groq import Groq
from llama_index.core import VectorStoreIndex
import google.genai.types as types
from google import genai
import os

import nest_asyncio
nest_asyncio.apply()


from google.colab import userdata
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')


# model = "gemini-2.5-flash-preview-09-2025"
model = "gemini-2.5-flash"
# model = "llama-3.1-8b-instant"
embedding_model = "intfloat/e5-small-v2"

# Generation settings shared by every LLM call in this notebook:
# a zero thinking budget and a low temperature keep answers short and
# more repeatable, which saves tokens.
config = types.GenerateContentConfig(
    thinking_config=types.ThinkingConfig(thinking_budget=0),
    temperature=0.4 # set this to make this less chatty and more deterministic to save the tokens
)

# Global LLM used by query engines and LLM-based postprocessors.
Settings.llm = GoogleGenAI(
    model=model,
    api_key=GOOGLE_API_KEY,
    generation_config=config,
)

# Choose the embedding device at run time instead of hard-coding "cuda",
# so the cell also runs on a CPU-only runtime (just slower).
import torch

embed_device = "cuda" if torch.cuda.is_available() else "cpu"

# Global embedding model used to embed queries against the vector store.
Settings.embed_model = HuggingFaceEmbedding(
    model_name=embedding_model,
    device=embed_device
)
Settings.text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=128)


# Look up the selected model's token limits and print them for reference.
client = genai.Client(api_key=GOOGLE_API_KEY)
model_info = client.models.get(model=model)
print(f"{model_info.input_token_limit=}")
print(f"{model_info.output_token_limit=}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

model_info.input_token_limit=1048576
model_info.output_token_limit=65536


Downloading vector store from Huggingface hub

In [3]:
# from huggingface_hub import hf_hub_download
# vectorstore = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="vectorstore.zip",repo_type="dataset",local_dir="/content")

In [11]:
import os

# Display the kernel's current working directory (relative paths resolve against it).
os.getcwd()

'/content'

In [3]:
# Extract the archive into the current working directory.
# NOTE(review): the listing printed by this command shows entries rooted at
# `content/ai_tutor_knowledge/`, while the loader cell opens
# `/content/ai_tutor/ai_tutor_knowledge` -- confirm the two locations agree.
!unzip "ai_tutor_knowledge.zip"

Archive:  ai_tutor_knowledge.zip
   creating: content/ai_tutor_knowledge/
   creating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/
  inflating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/link_lists.bin  
  inflating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/length.bin  
  inflating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/data_level0.bin  
  inflating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/index_metadata.pickle  
  inflating: content/ai_tutor_knowledge/3a3bc9f8-5760-45af-b368-453dd0ecec22/header.bin  
  inflating: content/ai_tutor_knowledge/chroma.sqlite3  


In [22]:
# !rm -rf "ai_tutor_knowledge"

In [4]:
# Load the vector store from the local storage.
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# try:
#     db.delete_collection("./mini-llama-articles/ai_tutor_knowledge")
#     print("Collection deleted. Starting fresh!")
# except:
#     print("Collection didn't exist or was already deleted.")

# Location of the persisted Chroma database and the collection stored in it.
# NOTE(review): confirm this path matches the directory the archive was
# actually extracted to.
CHROMA_PATH = "/content/ai_tutor/ai_tutor_knowledge"
COLLECTION_NAME = "ai_tutor_knowledge"

db2 = chromadb.PersistentClient(path=CHROMA_PATH)
chroma_collection = db2.get_or_create_collection(COLLECTION_NAME)

# get_or_create_collection() hands back a brand-new, empty collection when the
# name is not found, so a wrong path or name would otherwise go unnoticed and
# every later query would run against an empty store. Fail fast instead.
if chroma_collection.count() == 0:
    raise RuntimeError(
        f"Collection '{COLLECTION_NAME}' at '{CHROMA_PATH}' is empty; "
        "check that the vector store was extracted to this path."
    )

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [5]:
# Sanity check 1: how many chunks does the loaded collection hold?
item_total = chroma_collection.count()
print(f"Total items in collection: {item_total}")

# Sanity check 2: which collections exist in this database? The name used
# above has to match one of them exactly.
known_collections = db2.list_collections()
print(f"Existing collections: {known_collections}")

Total items in collection: 2573
Existing collections: [Collection(name=ai_tutor_knowledge)]


In [6]:
from llama_index.core import VectorStoreIndex

# Build the index object on top of the already-populated vector store;
# no documents are passed here, only the store itself.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [26]:
# Baseline run: plain similarity retrieval (10 nodes), no postprocessing.
baseline_question = "Explain how Advance RAG works?"
query_engine = index.as_query_engine(similarity_top_k=10)
res = query_engine.query(baseline_question)

In [27]:
# Display the answer text of the baseline (no postprocessor) query.
res.response

'The provided context does not contain information on how Advanced RAG works, only that it was used for POC purposes.'

In [None]:
# import os
# os._exit(0)

# RankGPT


In LlamaIndex, **Node Postprocessors** are specialized modules that transform, filter, or re-rank a set of nodes after they have been retrieved from an index but **before** they are sent to the LLM for response generation.

They are a critical part of an "Advanced RAG" workflow, ensuring that the context sent to the model is as relevant, high-quality, and cost-effective as possible.

### **How They Work in the Pipeline**

Postprocessors sit between the **Retriever** and the **Response Synthesizer**:

1. **Retrieval**: The system finds the top-$k$ nodes (chunks) most similar to your query.
2. **Postprocessing**: The postprocessors take those $k$ nodes and apply rules to change them (e.g., delete irrelevant ones, reorder them, or add more text).
3. **Synthesis**: The final refined list of nodes is sent to the LLM to write the answer.

---

### **The TimeWeightedPostprocessor**

The **`TimeWeightedPostprocessor`** is a specific module used to handle time-sensitive data. It helps the system choose the most **recent** information when multiple versions of a document exist or when information naturally decays over time.

#### **Key Features**

* **Recency Ranking**: It re-ranks nodes based on a combination of their original similarity score and their "age" (stored in metadata like a timestamp).
* **Time Decay**: It uses a `time_decay` factor to determine how quickly a document's relevance drops as it gets older.
* **Use Case**: If you have three versions of a policy manual from 2022, 2023, and 2024, this postprocessor ensures the 2024 version is prioritized, even if the older versions have slightly higher keyword similarity.

---

### **Other Common Postprocessors**

| Postprocessor | Purpose |
| --- | --- |
| **SimilarityPostprocessor** | Filters out any nodes that fall below a specific similarity score (e.g., discard anything with a score < 0.7). |
| **KeywordNodePostprocessor** | Ensures retrieved nodes contain mandatory keywords or do not contain excluded "negative" keywords. |
| **LLM Rerank** | Uses a second, more powerful LLM to double-check and re-sort the retrieved nodes to ensure they truly answer the question. |
| **LongContextReorder** | Reorders nodes so the most important info is at the beginning or end of the context, preventing the LLM from getting "lost in the middle". |
| **MetadataReplacement** | Swaps a small chunk of text (like a single sentence) with its larger surrounding context (the full paragraph) before sending it to the LLM. |

> LongContextReorder is from the paper "Lost In The Middle"


In [28]:
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank

# Reranker configured to keep 3 nodes, using the globally configured LLM;
# verbose=True makes it print the new rank order.
rankgpt = RankGPTRerank(llm=Settings.llm, top_n=3, verbose=True)

In [36]:
# Query engine = retrieval of related text chunks + an LLM that writes the
# final answer. Everything listed in `node_postprocessors` is applied to the
# retrieved nodes before answer generation; here that is the RankGPT reranker.
rag_question = "Explain how Retrieval Augmented Generation (RAG) works?"
query_engine = index.as_query_engine(
    node_postprocessors=[rankgpt],
    similarity_top_k=10,
)

res = query_engine.query(rag_question)

After Reranking, new rank list for nodes: [4, 6, 1, 5, 0, 3, 9, 2, 7, 8]

In [37]:
# Display the answer text produced with the RankGPT postprocessor.
res.response

'Retrieval Augmented Generation (RAG) models integrate pretrained dense retrieval (DPR) with sequence-to-sequence (seq2seq) models. These systems function by retrieving relevant documents and then passing them to a seq2seq model to generate outputs. The retriever and seq2seq components are initialized from pretrained models and are fine-tuned together, enabling both retrieval and generation processes to adapt to specific tasks.\n\nRAG addresses the issue of outdated knowledge in Large Language Models (LLMs) by connecting them to external, real-time data sources through retrieval mechanisms. This allows the LLM to combine its generative capabilities with the ability to search for and incorporate pertinent information from one or more knowledge bases.\n\nRAG systems can be classified based on several factors:\n*   **Source of Information:** This can include traditional databases, vector databases, knowledge graphs, or the internet.\n*   **Retrieval Mechanism:** Methods for collecting inf

In [38]:
# List every node that was handed to the LLM for the answer above.
divider = "-_" * 20
for hit in res.source_nodes:
    print("Node ID\t", hit.node_id)
    print("Title\t", hit.metadata["title"])
    print("Text\t", hit.text.strip())
    print("Score\t", hit.score)
    print(divider)

Node ID	 68528fe3-e101-4358-b3db-f0894346b348
Title	 Fine-Tuning LLMs with Synthetic Data for High-Quality Content Generation
Text	 during the training sessions  and this is usually true for the entire machine learning field. As the training process for LLMs is resource-intensive  costly  and time-consuming  it happens only at intervals of months (sometimes more)  and the model knowledge quickly becomes outdated. Frequent custom fine-tuning cycles are an option  but beyond being expensive  doing so indiscriminately can lead to a problem known as Catastrophic Forgetting (Catastrophic inferencing is also a common term for this phenomenon)  where the models forget previously learned knowledge. Plus  the models dont have access to real-time data. A more viable solution to deal with this is RAG.   RAG stands for Retrieval Augmented Generation  the name given to a family of processes that focuses on connecting the LLM to external sources through retrieval mechanisms. A combination of the gen

Next, I use **LongContextReorder** to place the most relevant nodes at the start and the end of the context, so that the best-matching text is not "lost in the middle" for the LLM. This postprocessor does not call an LLM behind the scenes.

In [33]:
from llama_index.core.postprocessor import LongContextReorder

# Same retrieval (10 nodes), but the nodes go through LongContextReorder
# before the LLM writes the answer.
reorder = LongContextReorder()
rag_question = "Explain how Retrieval Augmented Generation (RAG) works?"
query_engine = index.as_query_engine(
    node_postprocessors=[reorder],
    similarity_top_k=10,
)

res = query_engine.query(rag_question)

In [34]:
# Display the answer text produced with the LongContextReorder postprocessor.
res.response

'Retrieval Augmented Generation (RAG) models combine a pre-trained dense retrieval (DPR) system with sequence-to-sequence (seq2seq) models. These models function by retrieving relevant documents, passing them to a seq2seq model, and then marginalizing to generate outputs. Both the retriever and seq2seq modules are initialized from pre-trained models and are jointly fine-tuned to adapt to specific downstream tasks.\n\nThere are two main formulations for RAG:\n1.  One approach conditions on the same retrieved passages for the entire generated sequence.\n2.  Another approach allows for the use of different retrieved passages for each token generated.\n\nRAG addresses the limitations of outdated knowledge in large language models (LLMs) by connecting them to external, real-time data sources through retrieval mechanisms. This allows the LLM to combine its generative capabilities with the ability to search for and incorporate relevant information from various knowledge bases. RAG systems can

In [35]:
# Print id, title, text and score of each node used for the answer above.
rule = "-_" * 20
for source in res.source_nodes:
    print("Node ID\t", source.node_id)
    print("Title\t", source.metadata["title"])
    print("Text\t", source.text.strip())
    print("Score\t", source.score)
    print(rule)

Node ID	 5cbb84b5-c5e5-4e94-a189-43fa6bc9b138
Title	 RAG
Text	 extractive downstream tasks. We explore ageneral-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trainedparametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is apre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with apre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passagesacross the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate ourmodels on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks,outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generationtasks, we find that RAG models generate more specific, diverse and factual language than a state-of

# Custom Postprocessor


## **1. The `llmJudge` function (The "Brain")**

This function is a standalone utility that uses an LLM to evaluate text relevancy.

* **Structured Output (Pydantic):** It defines an `OrderedNodes` class. By passing this to `llm.structured_predict`, it forces the LLM to return a strictly formatted JSON object instead of a chatty response. This makes the scores easy to extract in code.
* **XML Tagging (Context Locking):** Wrapping data in `<NODE>` and `<QUERY>` tags prevents the LLM from getting confused between the user's question and the content of the articles.
* **Listwise Judging:** Unlike some rerankers that look at one node at a time, this function passes **all nodes** to the LLM at once. This allows the LLM to compare them against each other to decide which is truly the "best".
* **Proximity Scoring:** It asks for a decimal between 0 and 1. This "normalized" score allows for easy mathematical sorting in the next step.

---

## **2. `GerminiJudgePostProcessor` (The "Pipeline Bridge")**

This class is the "glue" that allows LlamaIndex to use the `llmJudge` function automatically during a query.

* **Inheritance:** By inheriting from `BaseNodePostprocessor`, this class gains the ability to "plug into" any LlamaIndex `QueryEngine`.

**The Internal Hook (`_postprocess_nodes`):**
* Link: https://developers.llamaindex.ai/python/framework-api-reference/postprocessor/
* **Inputs:** It receives `nodes` (the raw results from the vector database) and `query_bundle` (the user's question).
* **Integration:** It calls the `llmJudge` function and retrieves the list of scores.


**The Sorting & Truncation Logic:**
1. **Sorting:** It sorts the nodes from highest score (1.0) to lowest (0.0).
2. **Selection:** It keeps **at most the 3 highest-scoring nodes**, so the step still works when fewer than 3 nodes are retrieved.
3. **Filtering:** It returns only those 3 nodes to the LLM.



---


In [39]:
from pydantic import BaseModel
from llama_index.core.prompts import PromptTemplate

def llmJudge( nodes, query ):
  """Ask the globally configured LLM to score each node's relevance to the query.

  All nodes are sent in a single prompt, each wrapped in its own
  <NODEn> ... </NODEn> block, followed by the query wrapped in <QUERY> tags.

  Args:
    nodes: Retrieved items exposing ``node_id`` and ``get_text()``
      (``GerminiJudgePostProcessor`` passes ``NodeWithScore`` objects).
    query: The user question, either a plain string or an object carrying
      the text in a ``query_str`` attribute (e.g. ``QueryBundle``).

  Returns:
    An ``OrderedNodes`` instance holding two parallel lists: ``node_id``
    and ``score`` (the prompt asks for a decimal between 0 and 1 per node).
  """
  # Local import so this cell does not depend on imports made in later cells.
  from typing import List

  class OrderedNodes(BaseModel):
    """Node ids and the score assigned to each, as two parallel lists of equal length."""

    # Typed element types make the structured output carry numeric scores,
    # so callers can sort them without converting or guessing the type.
    node_id: List[str]
    score: List[float]

  # Nothing to judge: skip the LLM round-trip entirely.
  if not nodes:
    return OrderedNodes(node_id=[], score=[])

  # Prepare the nodes and wrap them in <NODE></NODE> identifiers.
  # Each item exposes node_id and its text (item.text would work as well as get_text()).
  node_blocks = [
    f"""
    <NODE{position}>\n
      Node ID: {item.node_id}\n
      Text {item.get_text()}\n
    </NODE{position}>\n
    """
    for position, item in enumerate(nodes, start=1)
  ]
  total_nodes = "".join(node_blocks)

  # Use the raw question text explicitly rather than relying on the
  # string conversion of a QueryBundle; plain strings pass through unchanged.
  query_text = getattr(query, "query_str", query)
  query = "<QUERY>\n{}\n</QUERY>".format(query_text)

  # Define the prompt template
  prompt_tmpl = PromptTemplate(
      """
  You receive a qurey along with a list of nodes' text and their ids. Your task is to assign score
  to each node based on its contextually closeness to the given query. The final output is each
  node id along with its proximity score.
  Here is the list of nodes:
  {nodes_list}

  And the following is the query:
  {user_query}

  Score each of the nodes based on their text and their relevancy to the provided query.
  The score must be a decimal number between 0 an 1 so we can rank them."""
  )

  # structured_predict makes the LLM answer in the OrderedNodes schema
  # instead of free text.
  ordered_nodes = Settings.llm.structured_predict(
    OrderedNodes,
    prompt_tmpl,
    nodes_list=total_nodes,
    user_query=query
  )

  return ordered_nodes


## Define Postprocessor

The following class uses the `llmJudge` function to score the nodes and keeps only the highest-scoring ones.


In [41]:
from typing import List, Optional
from llama_index.core import QueryBundle
from llama_index.core.postprocessor.types import BaseNodePostprocessor
from llama_index.core.schema import NodeWithScore


class GerminiJudgePostProcessor(BaseNodePostprocessor):
  """Node postprocessor that keeps the nodes an LLM judge scores highest.

  The retrieved nodes and the query are passed to ``llmJudge``; the nodes are
  then ranked by the returned scores and the best ``top_n`` are returned,
  highest score first.
  """

  # Maximum number of nodes handed on to the response synthesizer.
  top_n: int = 3

  def _postprocess_nodes(self, nodes: List[NodeWithScore], query_bundle: Optional[QueryBundle] = None) -> List[NodeWithScore]:
    """Rank ``nodes`` with the LLM judge and return at most ``top_n`` of them.

    Args:
      nodes: Nodes returned by the retriever.
      query_bundle: The query the nodes are judged against.

    Returns:
      Up to ``top_n`` of the input nodes, ordered from highest to lowest
      judge score. Ids returned by the judge that match no input node are
      ignored.

    Raises:
      ValueError: If ``query_bundle`` is None (there is nothing to judge against).
    """
    if query_bundle is None:
      raise ValueError("Query bundle must be provided.")

    # No nodes retrieved: nothing to rank, so avoid a pointless LLM call.
    if not nodes:
      return []

    obj = llmJudge( nodes, query_bundle.query_str )
    node_ids = obj.node_id
    scores = obj.score

    print("Node IDs:", node_ids)
    print("Scores:", scores)

    # zip() pairs each id with its score and stops at the shorter list, so a
    # reply with mismatched list lengths cannot cause an IndexError.
    ranked = sorted(zip(node_ids, scores), key=lambda pair: float(pair[1]), reverse=True)

    nodes_by_id = {item.node_id: item for item in nodes}

    # Walk the ranking from best to worst. Unknown or repeated ids are skipped,
    # so the result is still filled up to top_n from the remaining valid ids.
    final_nodes = []
    used_ids = set()
    for node_id, _score in ranked:
      if len(final_nodes) >= self.top_n:
        break
      if node_id in nodes_by_id and node_id not in used_ids:
        used_ids.add(node_id)
        final_nodes.append(nodes_by_id[node_id])

    return final_nodes


In [35]:
# Instantiate the custom LLM-as-judge postprocessor with its default settings.
judge = GerminiJudgePostProcessor()

In [42]:
# Query engine = retrieval of related text chunks + an LLM that writes the
# final answer. Everything listed in `node_postprocessors` is applied to the
# retrieved nodes before answer generation; here that is the custom judge.
rag_question = "Explain how Retrieval Augmented Generation (RAG) works?"
query_engine = index.as_query_engine(
    node_postprocessors=[judge],
    similarity_top_k=10,
)

res = query_engine.query(rag_question)

nodes [NodeWithScore(node=TextNode(id_='25b89322-bdda-4348-a4b0-3245ee8d524b', embedding=None, metadata={'url': 'https://towardsai.net/p/machine-learning/fine-tuning-llms-with-synthetic-data-for-high-quality-content-generation', 'title': 'Fine-Tuning LLMs with Synthetic Data for High-Quality Content Generation', 'tokens': 7359, 'source': 'tai_blog', 'questions_this_excerpt_can_answer': '1. What are two approaches for integrating RAG with LLMs based on when the RAG process is triggered?\n2. What file format was chosen for the documents in the POC and why?', 'prev_section_summary': "1. **Catastrophic Forgetting:** This occurs when LLMs, during frequent custom fine-tuning, forget previously learned knowledge.\n2. **RAG (Retrieval Augmented Generation):** RAG addresses outdated LLM knowledge by connecting the LLM to external, real-time data sources via retrieval mechanisms. It combines the LLM's generative capabilities with the ability to search and incorporate relevant information from kn

In [43]:
# Display the answer text produced with the custom judge postprocessor.
res.response

"Retrieval Augmented Generation (RAG) is a technique that integrates external, real-time data sources with large language models (LLMs) to enhance their knowledge and content generation capabilities. It addresses the limitation of LLMs having outdated knowledge by connecting them to retrieval mechanisms that can search and incorporate relevant information from various knowledge bases.\n\nThe process involves combining the generative abilities of an LLM with the capacity to search for and integrate pertinent information. RAG systems can vary based on several factors:\n\n*   **Source of Information:** These can include traditional databases, vector databases, knowledge graphs, or the internet.\n*   **Retrieval Mechanism:** Methods for collecting information can range from search engines and APIs to customized database searches.\n*   **Integration Method:** RAG can be integrated either before the user's prompt reaches the LLM, where the retrieval process enhances the prompt, or after the 

In [38]:
# Show the retrieved nodes
for src in res.source_nodes:
    print("Node ID\t", src.node_id)
    print("Title\t", src.metadata["title"])
    # Strip surrounding whitespace so this listing is formatted the same way
    # as the other node listings in this notebook.
    print("Text\t", src.text.strip())
    print("Score\t", src.score)
    print("-_" * 20)

Node ID	 5cbb84b5-c5e5-4e94-a189-43fa6bc9b138
Title	 RAG
Text	 extractive downstream tasks. We explore ageneral-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trainedparametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is apre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with apre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passagesacross the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate ourmodels on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks,outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generationtasks, we find that RAG models generate more specific, diverse and factual language than a state-of

> **IMPORTANT NOTE** It is better to use a distilled LLM, i.e. a mini version of the master (teacher) model, as the judge; this reduces cost and latency.

> Normally, we prefer a GPT model as the judge.