<a href="https://colab.research.google.com/github/msquareddd/ai-engineering-notebooks/blob/main/RAG/test_rag_docling_llamaindex_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG with Docling/LlamaIndex/LangChain

# Dependencies

In [None]:
%pip install -q "pydantic>=2.12.0"
%pip install -q \
  llama-index-core \
  llama-index-llms-vllm \
  llama-index-readers-docling \
  llama-index-readers-file \
  llama-index-node-parser-docling \
  llama-index-embeddings-huggingface \
  llama-index-vector-stores-faiss \
  llama-index-embeddings-vllm \
  langchain \
  langchain-community \
  langchain-openai \
  llama-index-llms-langchain \
  vllm \
  transformers \
  bitsandbytes \
  accelerate \
  faiss-cpu \
  huggingface_hub \
  python-dotenv

In [None]:
import os
import sys
from pathlib import Path
from warnings import filterwarnings
from dotenv import load_dotenv
import faiss

# Silence UserWarning raised from pydantic modules and FutureWarning raised
# from easyocr modules so they do not clutter the cell outputs.
filterwarnings(action="ignore", category=UserWarning, module="pydantic")
filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

# Workaround for HuggingFace Tokenizers parallelism issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("Dependencies loaded successfully.")

# Hugging Face Login

In [None]:
from huggingface_hub import login
from google.colab import userdata, files

# Set up LangSmith (optional): tracing is enabled only when the Colab secret
# can be read, so a missing key does not abort the notebook.
try:
    langsmith_api_key = userdata.get('LANGSMITH_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    langsmith_api_key = None

if langsmith_api_key:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = langsmith_api_key
    os.environ["LANGSMITH_PROJECT"] = "Test RAG Docling/LlamaIndex"
else:
    # Explicitly disable tracing so no unauthenticated trace uploads are attempted.
    os.environ["LANGSMITH_TRACING"] = "false"
    print("LANGSMITH_API_KEY secret not available; LangSmith tracing is disabled.")

# HuggingFace login
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Data Folder Creation

In [None]:
# Folder used as the notebook's data directory.
base_path = "/content/data"

# Report whether the folder is already present; create it otherwise.
if os.path.exists(base_path):
    print(f"Directory '{base_path}' already exists.")
else:
    os.makedirs(base_path)
    print(f"Directory '{base_path}' created successfully.")

# Main Parameters

In [None]:
# Hugging Face model id; in this notebook it is referenced only by the
# commented-out HuggingFaceLLM cell.
MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"#"mistralai/Magistral-Small-2509"
# Model passed to the vLLM server (--model) and requested through ChatOpenAI.
VLLM_MODEL = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16" #"Qwen/Qwen2.5-14B-Instruct-AWQ" #"cpatonn/NVIDIA-Nemotron-Nano-12B-v2-AWQ-8bit" # # "Qwen/Qwen2.5-14B-Thinking-AWQ" "Qwen/Qwen2.5-14B-Instruct-AWQ" #"Qwen/Qwen2.5-14B-Instruct"
# Embedding model loaded through HuggingFaceEmbedding.
EMBED_MODEL = "google/embeddinggemma-300m" #"BAAI/bge-small-en-v1.5"
# File the FAISS index is written to by save_faiss_index (relative to the
# current working directory).
DB_PATH = "./data/faiss_index.bin"
# Passed as max_tokens to the ChatOpenAI client.
MAX_NEW_TOKENS = 1024
# Passed to the vLLM server as --max-model-len.
MAX_MODEL_LEN = 4096*2
# Quantization setting; in this notebook it is referenced only by the
# commented-out LangChain VLLM wrapper cell.
QUANT = None #"awq_marlin" #"compressed-tensors" #

# Using Hugging Face

In [None]:
# from transformers import AutoModelForCausalLM, BitsAndBytesConfig
# import torch
# from llama_index.llms.huggingface import HuggingFaceLLM

# quant_config_4bit = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

# llm = HuggingFaceLLM(
#     model_name=MODEL,
#     tokenizer_name=MODEL,
#     context_window=8000, # Increased context window size
#     max_new_tokens=MAX_NEW_TOKENS,
#     model_kwargs={"quantization_config": quant_config_4bit, "dtype": torch.bfloat16},
#     generate_kwargs={"temperature": 0.7, "do_sample": True,"top_p": 0.95},
#     device_map="auto",
# )

# Using vLLM

## vLLM LangChain Wrapper

In [None]:
# from langchain_community.llms import VLLM

# llm = VLLM(model=VLLM_MODEL,
#            trust_remote_code=True,
#            max_new_tokens=MAX_NEW_TOKENS,
#            top_k=10,
#            top_p=0.95,
#            temperature=0.8,
#            tensor_parallel_size=1,
#            vllm_kwargs={
#             "quantization": QUANT,
#             "swap_space": 1,
#             "gpu_memory_utilization": 0.8,
#             "max_model_len": 4096,
#            }
# )


## vLLM Server for LangChain Tool Calling

In [None]:
!wget https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/resolve/main/nano_v3_reasoning_parser.py

In [None]:
import subprocess
import time
import sys
import requests
import os

# 1. Kill the previous server to free up Port 8000
print("Stopping old vLLM processes...")
!pkill -f vllm
time.sleep(5) # Give it a moment to release the port

# 2. Define the command
command = [
    sys.executable, "-m", "vllm.entrypoints.openai.api_server",
    "--model", VLLM_MODEL,
    "--dtype", "auto",
    "--trust-remote-code",
    "--port", "8000",
    "--gpu-memory-utilization", "0.8",
    "--max-model-len", str(MAX_MODEL_LEN),
    "--enable-auto-tool-choice",
    "--tool-call-parser", "qwen3_coder",
    "--reasoning-parser-plugin", "nano_v3_reasoning_parser.py",
    "--reasoning-parser","nano_v3",
]

# 3. Start the server in the background
with open("vllm_logs.txt", "w") as f:
    process = subprocess.Popen(command, stdout=f, stderr=f)

print("Starting vLLM server...")

# 4. Wait for the server to become ready
for i in range(360): # Wait up to 6 minutes
    try:
        response = requests.get("http://localhost:8000/health")
        if response.status_code == 200:
            print("\nvLLM Server is ready!")
            break
    except requests.exceptions.ConnectionError:
        pass
    print(".", end="", flush=True)
    time.sleep(1)
else:
    print("\nTimeout: Server did not start within 6 minutes.")
    print("Check content of 'vllm_logs.txt' for errors.")

In [None]:
from langchain_openai import ChatOpenAI

# Base URL of the OpenAI-compatible API exposed by the local vLLM server.
inference_server_url = "http://localhost:8000/v1"

# Generation settings passed to the chat client.
generation_settings = {
    "max_tokens": MAX_NEW_TOKENS,
    "temperature": 0.6,
    "top_p": 0.95,
}

# Chat model client pointed at the local server; the key is a placeholder.
llm = ChatOpenAI(
    model=VLLM_MODEL,
    openai_api_base=inference_server_url,
    openai_api_key="EMPTY",
    verbose=True,
    **generation_settings,
)

# Embeddings

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model; passed as `embed_model` when the vector indexes are built.
embeddings = HuggingFaceEmbedding(model_name=EMBED_MODEL)

In [None]:
# Document to ingest in the single-file test, read from the Colab secrets.
SOURCE = userdata.get('DOC_PATH')
print(f"Source document: {SOURCE}")

# Embed a throwaway string once; the vector length is used as the index dimension.
probe_embedding = embeddings.get_text_embedding("hi")
embed_dim = len(probe_embedding)

In [None]:
# FAISS IndexFlatL2 index whose dimension is the probed embedding length.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index created successfully!")

In [None]:
def save_faiss_index(index, path):
    """Write a FAISS index to disk, creating the target directory if needed.

    Args:
        index: FAISS index object to serialize.
        path: Destination file, as a string or any ``os.PathLike`` object.
    """
    # faiss.write_index is handed a plain string, so normalise Path objects.
    target = os.fspath(path)
    parent_dir = os.path.dirname(target)
    if parent_dir:
        # A missing folder (e.g. a different working directory) would
        # otherwise make the write fail.
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, target)
    print(f"FAISS index saved to {path}")

In [None]:
def check_faiss_index_status(index):
    """Print how many vectors the FAISS index holds and its dimension."""
    vector_count, dimension = index.ntotal, index.d
    report_lines = (
        f"FAISS index contains {vector_count} vectors",
        f"Index dimension: {dimension}",
    )
    for report_line in report_lines:
        print(report_line)

# Single file extraction for Testing

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore

# Docling-based document reader and Markdown node parser.
node_parser = MarkdownNodeParser()
reader = DoclingReader()

# Wrap the FAISS index so LlamaIndex can store vectors in it.
vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# Load the single source document, then build the index on top of the store.
documents = reader.load_data(SOURCE)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    storage_context=storage_context,
    transformations=[node_parser],
    embed_model=embeddings,
)

# Persist the raw FAISS index and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
QUERY = "Che risultati ha dato l'analisi X fatta nel progetto Y"

# Build a query engine over the index and run the test question.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = query_engine.query(QUERY)

print("Query executed successfully!\n\n")

# Show the answer followed by the text and metadata of each source node.
answer = result.response.strip()
print(f"Q: {QUERY}\nA: {answer}\n\nSources:")
display([(node.text, node.metadata) for node in result.source_nodes])

# Directory Reader

In [None]:
# Folder scanned by SimpleDirectoryReader; same location as `base_path`.
DIR_PATH = "/content/data"

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import SimpleDirectoryReader

# Start from an empty FAISS index for the directory run.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index reset successfully!")

vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

reader = DoclingReader(export_type=DoclingReader.ExportType.MARKDOWN)
node_parser = MarkdownNodeParser()

dir_reader = SimpleDirectoryReader(
    input_dir=DIR_PATH,
    # DB_PATH can resolve to a file inside DIR_PATH (it does when the working
    # directory is /content). Exclude the saved FAISS binary so it is not
    # ingested as if it were a document.
    exclude=[os.path.basename(DB_PATH)],
    file_extractor={".pdf": reader,
                    ".docx": reader,
                    ".pptx": reader,
                    ".xlsx": reader,
                    },
)

# Parse every file in the folder and embed the resulting nodes into the store.
index = VectorStoreIndex.from_documents(
    documents=dir_reader.load_data(),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embeddings,
)

print("Index created successfully!")
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

## Query Test on Directory

In [None]:
# Test question (Italian). The accented "è" was mis-decoded as "Ã¨" and is restored here.
QUERY = "che colla è stata usata nel progetto marelli?"

In [None]:
# Run the question through a query engine built on the directory index.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = query_engine.query(QUERY)

# Show the answer followed by the text and metadata of each source node.
answer = result.response.strip()
print(f"Q: {QUERY}\nA: {answer}\n\nSources:")
display([(node.text, node.metadata) for node in result.source_nodes])

In [None]:
# Preview what the retriever alone returns for the test question.
retriever = index.as_retriever(similarity_top_k=5)
nodes = retriever.retrieve(QUERY)
context_text = "\n\n".join([n.text for n in nodes])
source_files = {n.metadata.get("file_name") for n in nodes if "file_name" in n.metadata}

# Mirror the document-search tool: only append a "Sources" list when at least
# one node carries a file name, instead of printing an empty bullet.
if source_files:
    print(context_text + "\n\nSources:\n- " + "\n- ".join(sorted(source_files)))
else:
    print(context_text)

# System Prompts

## Prompt for create_agent

In [None]:
# System prompt for the agent. The two section-header emoji had been
# mis-decoded into mojibake and are restored to the intended characters.
template = """You are a helpful, polite, and conversational AI assistant that answers user questions **only** using information retrieved through the provided tools.

## ✅ Rules (Follow Strictly)

1. **Always** call a tool before producing a final answer.
2. Base your **final answer only** on information retrieved from the tool.
3. If the documents do **not** contain the answer, say: "The requested information is not available in the searched documents."
4. Respond in the **same language** as the user's question.
5. You **must** include the **Sources** section verbatim at the end.
6. Never reveal chain-of-thought in the final answer.
7. **Always** include the sources you get from the search tool in the answer.
8. The tool might provide various informations, feel free to use just the relevant ones.

## 🔧 Tools Available

You have access to the following tools:

- **Document Search**: Performs search on documents based on the user query.
Some documents are in English and some are in Italian, it might be useful running the query in both languages to maximize information.
You can call it multiple times if you think it could be useful to get more context.
"""

# LangChain Set-up

## Retriever

In [None]:
# Retriever used by the document-search tool, configured with similarity_top_k=10.
retriever = index.as_retriever(similarity_top_k=10)

## Tool Definition

In [None]:
# from langchain.tools import tool

# #@tool
# def query_and_format_results(query):
#     """ RAG tool for querying documents """
#     print("Invoking RAG tool")

#     nodes = retriever.retrieve(query)

#     context_text = "\n\n".join([n.text for n in nodes])
#     source_files = {n.metadata.get("file_name") for n in nodes if "file_name" in n.metadata}

#     if source_files:
#         return context_text + "\n\nSources:\n- " + "\n- ".join(sorted(source_files))
#     return context_text


In [None]:
from langchain.tools import tool

@tool(
    "document_search",
    parse_docstring=True,
    description=(
        # Adjacent literals are concatenated, so the separator must be explicit.
        "Perform search on documents based on the user query. "
        "Use this whenever the user asks questions about documents."
    ),
)
def query_and_format_results(query: str) -> str:
    """Retrieve passages relevant to a query and list the files they came from.

    Args:
        query: Natural-language search query to run against the indexed documents.

    Returns:
        The text of the retrieved nodes separated by blank lines, followed by
        a "Sources" list of file names when any node carries a ``file_name``.
    """
    print("Invoking RAG tool")

    nodes = retriever.retrieve(query)

    context_text = "\n\n".join([n.text for n in nodes])
    source_files = {n.metadata.get("file_name") for n in nodes if "file_name" in n.metadata}

    # Only append the source list when there is something to cite.
    if source_files:
        return context_text + "\n\nSources:\n- " + "\n- ".join(sorted(source_files))
    return context_text

## Using create_agent




In [None]:
from langchain.tools import tool
from langchain.agents import create_agent
from dataclasses import dataclass
from langgraph.checkpoint.memory import InMemorySaver
from langchain_community.utilities import SQLDatabase


@dataclass
class RuntimeContext:
    """Context schema handed to ``create_agent`` through ``context_schema``."""
    # NOTE(review): nothing visible in this notebook reads `db` or supplies a
    # context when the agent is invoked — confirm this field is still needed.
    db: SQLDatabase

# Build the agent from the chat model, the document-search tool and `template`
# as system prompt. An InMemorySaver is supplied as the checkpointer.
agent = create_agent(
    model=llm,
    tools=[query_and_format_results],
    system_prompt=template,
    context_schema=RuntimeContext,
    checkpointer=InMemorySaver(),
)

# Gradio Interface

In [None]:
import gradio as gr


def chatbot_response(message, history, request: "gr.Request" = None):
    """Send the latest user message to the agent and return its reply.

    Args:
        message: Latest user message from the chat UI.
        history: Chat history supplied by Gradio; not used here because the
            agent is invoked with a checkpointer keyed by thread id.
        request: Request object for the current call, if provided. Its
            ``session_hash`` is used as the thread id so separate sessions do
            not share one conversation memory.

    Returns:
        The ``content`` of the last message in the agent's result.
    """
    # Fall back to the original fixed thread id when no session is known
    # (for example when the function is called directly).
    session_id = getattr(request, "session_hash", None)
    thread_id = session_id if session_id else "1"

    result = agent.invoke(
        {"messages": [{"role": "user", "content": message}]},
        {"configurable": {"thread_id": thread_id}},
    )

    return result["messages"][-1].content


# Chat UI whose callback is chatbot_response.
interface = gr.ChatInterface(
    fn=chatbot_response,
    title="R&D Projects Chatbot",
    description="Ask questions about the R&D projects documents.",
    theme="ocean" # other theme names noted by the author: soft, glass, default
)

# Start the app with share=True and debug=True.
interface.launch(share=True, debug=True)