<a href="https://colab.research.google.com/github/msquareddd/ai-engineering-notebooks/blob/main/RAG/test_rag_docling_llamaindex_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG with Docling/LlamaIndex

In [None]:
%pip install -q --progress-bar off --no-warn-conflicts llama-index-llms-vllm llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-readers-file python-dotenv transformers bitsandbytes accelerate faiss-cpu llama-index-vector-stores-faiss vllm llama-index-embeddings-vllm huggingface_hub

In [None]:
import os
from pathlib import Path
from warnings import filterwarnings
from dotenv import load_dotenv
import faiss


# Ignore UserWarning raised from pydantic modules and FutureWarning raised
# from easyocr modules.
filterwarnings("ignore", category=UserWarning, module="pydantic")
filterwarnings("ignore", category=FutureWarning, module="easyocr")

# TOKENIZERS_PARALLELISM is turned off because of
# https://github.com/huggingface/transformers/issues/5486;
# HF_HOME is set to the relative "model/" directory.
os.environ.update(TOKENIZERS_PARALLELISM="false", HF_HOME="model/")

In [None]:
from huggingface_hub import login
from google.colab import userdata, files

# Set up LangSmith (optional): tracing is only enabled when the Colab secret
# exists and this notebook has been granted access to it. `userdata.get`
# raises instead of returning None, so a missing secret must be caught here
# for the step to be genuinely optional.
try:
    langsmith_api_key = userdata.get("LANGSMITH_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    langsmith_api_key = None

if langsmith_api_key:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = langsmith_api_key
    os.environ["LANGSMITH_PROJECT"] = "RAG LlamaIndex"
else:
    print("LANGSMITH_API_KEY secret not available; LangSmith tracing is disabled.")

# HuggingFace login (required): the token is read from the HF_TOKEN Colab
# secret and also stored as a git credential.
hf_token = userdata.get("HF_TOKEN")
login(hf_token, add_to_git_credential=True)

We can now define the main parameters:

In [None]:
# Model id passed to HuggingFaceLLM in the "Using Hugging Face" section.
# Other candidates noted here: "mistralai/Magistral-Small-2509",
# "nvidia/NVIDIA-Nemotron-Nano-9B-v2", "nemotron-mini:4b".
MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# Model id passed to Vllm in the "Using vLLM" section.
VLLM_MODEL = "Qwen/Qwen2.5-14B-Instruct"
# Embedding model id; alternative noted here: "BAAI/bge-small-en-v1.5".
EMBED_MODEL = "google/embeddinggemma-300m"
# File the FAISS index is written to by save_faiss_index().
DB_PATH = "./data/faiss_index.bin"

# Using Hugging Face

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# Embedding model created through the Hugging Face integration.
embeddings = HuggingFaceEmbedding(model_name=EMBED_MODEL)

# 4-bit bitsandbytes configuration: "nf4" quantization type, double
# quantization enabled, bfloat16 as the compute dtype.
quant_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Generator LLM: MODEL is used for both the model and the tokenizer, and the
# quantization config above is forwarded through `model_kwargs`.
llm = HuggingFaceLLM(
    model_name=MODEL,
    tokenizer_name=MODEL,
    context_window=8000, # Increased context window size
    max_new_tokens=1024,
    model_kwargs={"quantization_config": quant_config_4bit, "dtype": torch.bfloat16},
    # Sampling is enabled (temperature 0.7, top_p 0.95).
    generate_kwargs={"temperature": 0.7, "do_sample": True,"top_p": 0.95},
    device_map="auto",
)

# Using vLLM

In [None]:
from llama_index.llms.vllm import Vllm

# os.environ["HUGGINGFACE_API_KEY"] = "hf_############################"

# vLLM-backed generator. NOTE: this rebinds `llm`, replacing the object
# created in the "Using Hugging Face" section if that cell was also run.
llm = Vllm(
    model=VLLM_MODEL,
    dtype="float16",
    tensor_parallel_size=1,
    temperature=0.9,
    max_new_tokens=512,
    # Extra engine options forwarded to vLLM as-is.
    vllm_kwargs={
        "swap_space": 1,
        "gpu_memory_utilization": 0.6,
        "max_model_len": 4096,
    },
)

In [None]:
from llama_index.embeddings.vllm import VllmEmbedding

# vLLM-backed embedding model. NOTE: this rebinds `embeddings`, replacing the
# Hugging Face embedding object if that cell was also run.
embeddings = VllmEmbedding(
    model_name=EMBED_MODEL,
    # The EmbeddingGemma model card states its activations do not support
    # float16; use bfloat16 (or "float32" on GPUs without bfloat16 support).
    dtype="bfloat16",
    tensor_parallel_size=1,
    # You can pass additional vllm-specific arguments here
    vllm_kwargs={
        "swap_space": 1,
        # The vLLM LLM engine above already reserves 0.6 of the GPU; a second
        # 0.6 would over-commit the same device, so keep this engine small.
        "gpu_memory_utilization": 0.2,
    },
)

In [None]:
# Document loaded by the single-file indexing cells below.
SOURCE = "/content/data/80019_Marelli_MotorSport_Report.docx"

# Embed a short probe string once to discover the embedding dimension, which
# is used to size the FAISS index.
embed_dim = len(embeddings.get_text_embedding("hi"))

In [None]:
# Always start from a new, empty flat L2 index sized to the embedding
# dimension. Reloading a previously saved index from DB_PATH is currently
# disabled; the disabled logic is kept here for reference:
# if os.path.exists(DB_PATH):
#     # Load existing index
#     faiss_index = faiss.read_index(DB_PATH)
#     print("Index loaded successfully!")
# else:
#     # Create new index
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index created successfully!")

In [None]:
def save_faiss_index(index, path):
    """Write a FAISS index to disk, creating the target directory if needed.

    Args:
        index: The FAISS index to serialize.
        path: Destination file path (``str`` or ``os.PathLike``).
    """
    target = os.fspath(path)
    parent_dir = os.path.dirname(target)
    if parent_dir:
        # Writing fails when the directory is missing (e.g. "./data" on a
        # fresh runtime), so make sure it exists first.
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, target)
    print(f"FAISS index saved to {path}")

In [None]:
def check_faiss_index_status(index):
    """Print how many vectors the FAISS index holds and its dimension.

    Args:
        index: Object exposing ``ntotal`` (vector count) and ``d``
            (dimension) attributes.
    """
    vector_count = index.ntotal
    print(f"FAISS index contains {vector_count} vectors")
    dimension = index.d
    print(f"Index dimension: {dimension}")

# Using Markdown export

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore

# DoclingReader is created with its default settings here and paired with a
# Markdown node parser.
reader = DoclingReader()
node_parser = MarkdownNodeParser()

# Wrap the FAISS index created earlier so LlamaIndex stores vectors in it.
vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# Load SOURCE, split it with the node parser, embed the nodes with
# `embeddings`, and store the vectors in the FAISS-backed vector store.
index = VectorStoreIndex.from_documents(
    documents=reader.load_data(SOURCE),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embeddings,
)

# Write the raw FAISS index to DB_PATH and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
# Italian query; roughly: "What were the results of the metallographic
# analysis carried out in the Marelli project?"
QUERY = "Che risultati ha dato l'analisi metallografica fatta nel progetto Marelli?"

# Query the index with `llm`, retrieving the 5 most similar nodes.
result = index.as_query_engine(llm=llm, similarity_top_k=5).query(QUERY)

print("Query executed successfully!\n\n")

print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
# Show the text and metadata of every source node returned with the result.
display([(n.text, n.metadata) for n in result.source_nodes])

# Using Docling format

To leverage Docling's rich native format, we:
- create a `DoclingReader` with JSON export type, and
- employ a `DoclingNodeParser` in order to appropriately parse that Docling format.

Notice how the sources now also contain document-level grounding (e.g. page number or bounding box information):

In [None]:
from llama_index.node_parser.docling import DoclingNodeParser

# Replace the FAISS index with a new, empty one so this section does not
# reuse vectors added by an earlier section.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index reset successfully!")

vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# JSON export type for the reader, parsed by DoclingNodeParser.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
node_parser = DoclingNodeParser()

# Load SOURCE, split it with the Docling node parser, embed the nodes with
# `embeddings`, and store the vectors in the new FAISS-backed vector store.
index = VectorStoreIndex.from_documents(
    documents=reader.load_data(SOURCE),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embeddings,
)

print("Index created successfully!")
# Write the raw FAISS index to DB_PATH (same file as the previous section)
# and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
# Italian query; roughly: "What were the results of the metallographic
# analysis carried out in the Marelli project?"
QUERY = "Che risultati ha dato l'analisi metallografica fatta nel progetto Marelli?"

# Query with the `llm` object defined earlier in the notebook (no `GEN_MODEL`
# name is ever defined), retrieving the 5 most similar nodes.
result = index.as_query_engine(llm=llm, similarity_top_k=5).query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
# Show the text and metadata of every source node returned with the result.
display([(n.text, n.metadata) for n in result.source_nodes])

# With Simple Directory Reader

To demonstrate this usage pattern, we first set up a test document directory.

In [None]:
# Directory whose files are loaded by SimpleDirectoryReader below.
DIR_PATH = "/content/data"

Using `reader` and `node_parser` definitions like those from any of the above variants (the cell below re-creates the Markdown ones), usage with `SimpleDirectoryReader` then looks as follows:

In [None]:
from llama_index.core import SimpleDirectoryReader

# Markdown export for the reader, parsed by the Markdown node parser.
reader = DoclingReader(export_type=DoclingReader.ExportType.MARKDOWN)
node_parser = MarkdownNodeParser()

# Start from an empty FAISS index, as the previous section does. Reusing the
# earlier vector store would leave that section's vectors in the index even
# though the new VectorStoreIndex holds no nodes for them.
faiss_index = faiss.IndexFlatL2(embed_dim)
vector_store = FaissVectorStore(faiss_index=faiss_index)

dir_reader = SimpleDirectoryReader(
    input_dir=DIR_PATH,
    # Only load the formats routed to Docling below; other files in the
    # directory (e.g. the saved FAISS index, if DB_PATH resolves inside
    # DIR_PATH) are skipped instead of being read as text.
    required_exts=[".pdf", ".docx", ".pptx", ".xlsx"],
    file_extractor={".pdf": reader,
                    ".docx": reader,
                    ".pptx": reader,
                    ".xlsx": reader,
                    },
)

# Load every matching file, split with the node parser, embed the nodes with
# `embeddings`, and store the vectors in the new FAISS-backed vector store.
index = VectorStoreIndex.from_documents(
    documents=dir_reader.load_data(),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embeddings,
)

print("Index created successfully!")
# Write the raw FAISS index to DB_PATH and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
# Italian query; roughly: "Did Corrada collaborate with UNIVAQ?"
QUERY = "Corrada ha collaborato con UNIVAQ?"

In [None]:
# Query with the `llm` object defined earlier in the notebook (no `GEN_MODEL`
# name is ever defined), retrieving the 5 most similar nodes.
result = index.as_query_engine(llm=llm, similarity_top_k=5).query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
# Show the text and metadata of every source node returned with the result.
display([(n.text, n.metadata) for n in result.source_nodes])