<a href="https://colab.research.google.com/github/msquareddd/ai-engineering-notebooks/blob/main/RAG/test_rag_docling_llamaindex_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG with Docling/LlamaIndex

In [None]:
%pip install -q --progress-bar off --no-warn-conflicts llama-index-llms-vllm llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-readers-file python-dotenv transformers bitsandbytes accelerate faiss-cpu llama-index-vector-stores-faiss vllm llama-index-embeddings-vllm huggingface_hub

In [None]:
%pip install -q langchain gradio

In [None]:
import os
from pathlib import Path
from warnings import filterwarnings
from dotenv import load_dotenv
import faiss


# Hide known-noisy warnings emitted from the pydantic and easyocr modules.
filterwarnings("ignore", module="pydantic", category=UserWarning)
filterwarnings("ignore", module="easyocr", category=FutureWarning)

# Process-wide environment settings:
#   TOKENIZERS_PARALLELISM - see https://github.com/huggingface/transformers/issues/5486
#   HF_HOME                - Hugging Face home directory, set to the relative "model/" folder
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "HF_HOME": "model/",
    }
)

In [None]:
from huggingface_hub import login
from google.colab import userdata, files

# Set up LangSmith (optional).
# `userdata.get` raises when the secret is missing or the notebook has no access to it,
# so tracing is only switched on when the key can actually be read.
try:
    langsmith_api_key = userdata.get('LANGSMITH_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    langsmith_api_key = None

if langsmith_api_key:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = langsmith_api_key
    os.environ["LANGSMITH_PROJECT"] = "Test RAG Docling/LlamaIndex"
else:
    print("LANGSMITH_API_KEY secret not available; LangSmith tracing is disabled.")

# HuggingFace login (required: the HF_TOKEN secret must exist)
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

We can now define the main parameters:

In [None]:
# Model id loaded by the Hugging Face backend.
# Other ids the author left here for reference:
#   "mistralai/Magistral-Small-2509", "nvidia/NVIDIA-Nemotron-Nano-9B-v2", "nemotron-mini:4b"
MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# Model id loaded by the vLLM backend.
VLLM_MODEL = "Qwen/Qwen2.5-14B-Instruct"
# Embedding model id (alternative noted by the author: "BAAI/bge-small-en-v1.5").
EMBED_MODEL = "google/embeddinggemma-300m"
# File the raw FAISS index is written to.
DB_PATH = "./data/faiss_index.bin"
# Passed as `max_new_tokens` to both LLM backends.
MAX_NEW_TOKENS = 1024

# Using Hugging Face

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# bitsandbytes 4-bit settings: NF4 quantization with double quantization,
# computing in bfloat16.
quant_config_4bit = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# LlamaIndex wrapper around the Hugging Face model; binds the notebook-wide `llm`.
llm = HuggingFaceLLM(
    model_name=MODEL,
    tokenizer_name=MODEL,
    device_map="auto",
    context_window=8000,  # Increased context window size
    max_new_tokens=MAX_NEW_TOKENS,
    generate_kwargs=dict(temperature=0.7, do_sample=True, top_p=0.95),
    model_kwargs=dict(quantization_config=quant_config_4bit, dtype=torch.bfloat16),
)

# Using vLLM

In [None]:
from llama_index.llms.vllm import Vllm

# vLLM-backed alternative. This rebinds `llm`, so whichever LLM cell ran last
# is the one used by the query engines further down.
llm = Vllm(
    model=VLLM_MODEL,
    tensor_parallel_size=1,
    dtype="float16",
    max_new_tokens=MAX_NEW_TOKENS,
    temperature=0.9,
    vllm_kwargs=dict(
        swap_space=1,
        gpu_memory_utilization=0.6,
        max_model_len=4096,
    ),
)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by index construction and the embedding-dimension probe.
embeddings = HuggingFaceEmbedding(model_name=EMBED_MODEL)

In [None]:
# Location of the single document to index, read from the Colab secret 'DOC_PATH'.
SOURCE = userdata.get('DOC_PATH')

print(f"Source document: {SOURCE}")

# Embed a throwaway string once to discover the vector length; the FAISS index
# is sized with this value.
embed_dim = len(embeddings.get_text_embedding("hi"))

In [None]:
# NOTE: reloading a previously saved index from DB_PATH is currently disabled
# (the commented-out branch below is kept for reference), so a fresh, empty
# index is created every time this cell runs.
# if os.path.exists(DB_PATH):
#     # Load existing index
#     faiss_index = faiss.read_index(DB_PATH)
#     print("Index loaded successfully!")
# else:
#     # Create new index
# Flat L2-distance index sized to the embedding dimension.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index created successfully!")

In [None]:
def save_faiss_index(index, path):
    """Write a FAISS index to disk, creating the target directory if needed.

    Args:
        index: FAISS index object to serialize.
        path: Destination file path (``str`` or ``os.PathLike``).
    """
    path = os.fspath(path)
    # faiss.write_index does not create missing directories (e.g. "./data"),
    # so make sure the parent folder exists before writing.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, path)
    print(f"FAISS index saved to {path}")

In [None]:
def check_faiss_index_status(index):
    """Print the number of stored vectors and the dimension of a FAISS index."""
    report = (
        f"FAISS index contains {index.ntotal} vectors",
        f"Index dimension: {index.d}",
    )
    print(*report, sep="\n")

# Single file extraction

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore

# Docling reader (default settings) plus Markdown-based node splitting.
reader = DoclingReader()
node_parser = MarkdownNodeParser()

# Wrap the FAISS index created earlier so LlamaIndex can write vectors into it.
vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# Parse the single SOURCE document, then embed and index its nodes.
documents = reader.load_data(SOURCE)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    transformations=[node_parser],
    storage_context=storage_context,
    embed_model=embeddings,
)

# Persist the raw FAISS index and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
QUERY = "Che risultati ha dato l'analisi metallografica fatta nel progetto Marelli?"

# Retrieve the 5 most similar nodes and let the active `llm` synthesize the answer.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = query_engine.query(QUERY)

print("Query executed successfully!\n\n")

print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
# Show the text and metadata of every node the answer was built from.
display([(node.text, node.metadata) for node in result.source_nodes])

# Directory Reader

In [None]:
# Folder whose files are indexed by the directory reader (absolute Colab path).
DIR_PATH = "/content/data"

In [None]:
from llama_index.core import SimpleDirectoryReader

# Start from an empty FAISS index so vectors from the single-file run are discarded.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index reset successfully!")

vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# Docling exports each document as Markdown, which the Markdown node parser then splits.
reader = DoclingReader(export_type=DoclingReader.ExportType.MARKDOWN)
node_parser = MarkdownNodeParser()

# Route every supported office/PDF extension through the Docling reader.
docling_extensions = (".pdf", ".docx", ".pptx", ".xlsx")
dir_reader = SimpleDirectoryReader(
    input_dir=DIR_PATH,
    file_extractor={extension: reader for extension in docling_extensions},
)

documents = dir_reader.load_data()
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=documents,
    transformations=[node_parser],
    storage_context=storage_context,
    embed_model=embeddings,
)

print("Index created successfully!")
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
# Question run against the directory-wide index (typo "test mad on" fixed so the
# retriever and the LLM receive well-formed text).
QUERY = "In the marelli project, what were the results of the test made on the rings?"

In [None]:
# Same retrieval settings as the single-file query: top-5 nodes, answer from the active `llm`.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = query_engine.query(QUERY)
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
display([(node.text, node.metadata) for node in result.source_nodes])

# LangChain Set-up

In [None]:
from typing import Any, List, Optional

from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.agents import initialize_agent
from langchain_core.language_models.llms import LLM


class LlamaIndexLLM(LLM):
    """Minimal LangChain LLM adapter around a LlamaIndex LLM.

    LangChain does not ship a `LlamaIndexLLM` class (nor does `langchain.prompts`
    export `SystemMessage`), so the adapter is defined here. It forwards the
    prompt to the wrapped model's `complete()` method and returns the text.
    """

    # The wrapped LlamaIndex LLM (e.g. the notebook-wide `llm`).
    llm: Any

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for logging/serialization."""
        return "llama_index"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """Run one completion and cut the output at the first stop sequence.

        The ReAct agent relies on stop sequences (e.g. "\\nObservation:") to keep
        the model from inventing tool results, so they are enforced here.
        """
        text = self.llm.complete(prompt).text
        for stop_sequence in stop or []:
            cut = text.find(stop_sequence)
            if cut != -1:
                text = text[:cut]
        return text


# 1. Build the LlamaIndex query engine that backs the LangChain tool
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

# 2. Define the generic RAG tool for the LangChain agent
tools = [
    Tool(
        name="Document Search",
        func=lambda q: str(query_engine.query(q)),
        description="Useful for finding and answering questions about the information contained in the uploaded documents.",
    ),
]

# 3. Set up memory for the chatbot.
#    The conversational ReAct agent uses a plain string prompt, so the history
#    is kept as text rather than as a list of message objects.
memory = ConversationBufferMemory(memory_key="chat_history")

# 4. Wrap the LlamaIndex LLM for use in LangChain
langchain_llm = LlamaIndexLLM(llm=llm)

# 5. DEFINE THE SYSTEM PROMPT
system_prompt_content = """You are a helpful and friendly AI assistant.

Your primary purpose is to answer questions based on the information available in the documents provided through the 'Document Search' tool.

Follow these rules strictly:
1.  Be polite and conversational in your responses.
2.  Before answering, you must use the 'Document Search' tool to find relevant information.
3.  Base your answers *only* on the information retrieved from the tool. Do not use any of your internal knowledge.
4.  If the documents do not contain the answer to a question, you must clearly state that the information is not available in the provided documents instead of trying to make up an answer.
"""

# 6. Set up agent_kwargs.
#    "conversational-react-description" has no `system_message` option; its
#    instructions are customised through `prefix`, which is placed directly
#    before the generated tool list, hence the "TOOLS" lead-in.
agent_kwargs = {
    "prefix": system_prompt_content
    + "\nTOOLS:\n------\n\nYou have access to the following tools:"
}

# 7. Initialize the LangChain agent with the custom prompt prefix
agent_chain = initialize_agent(
    tools,
    langchain_llm,
    agent="conversational-react-description",
    memory=memory,
    verbose=True,
    agent_kwargs=agent_kwargs,
)


## With Sources

In [None]:
from typing import Any, List, Optional

from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.agents import initialize_agent
from langchain_core.language_models.llms import LLM


class LlamaIndexLLM(LLM):
    """Minimal LangChain LLM adapter around a LlamaIndex LLM.

    LangChain does not ship a `LlamaIndexLLM` class (nor does `langchain.prompts`
    export `SystemMessage`), so the adapter is defined here. It forwards the
    prompt to the wrapped model's `complete()` method and returns the text.
    """

    # The wrapped LlamaIndex LLM (e.g. the notebook-wide `llm`).
    llm: Any

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for logging/serialization."""
        return "llama_index"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """Run one completion and cut the output at the first stop sequence.

        The ReAct agent relies on stop sequences (e.g. "\\nObservation:") to keep
        the model from inventing tool results, so they are enforced here.
        """
        text = self.llm.complete(prompt).text
        for stop_sequence in stop or []:
            cut = text.find(stop_sequence)
            if cut != -1:
                text = text[:cut]
        return text


# 1. Build the LlamaIndex query engine that backs the LangChain tool
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

# 2. Define the generic RAG tool for the LangChain agent
#    This function formats the sources into the final string.
def query_and_format_results(query: str) -> str:
    """Query the LlamaIndex engine and append the source file names.

    Args:
        query: Natural-language question forwarded to `query_engine`.

    Returns:
        The stripped answer text. When any retrieved node carries a
        'file_name' metadata entry, a "Sources:" bullet list of the unique,
        alphabetically sorted file names is appended.
    """
    result = query_engine.query(query)
    # `response` may be None when nothing was synthesized; treat that as empty text.
    answer = (result.response or "").strip()

    # Collect unique file names; nodes without 'file_name' metadata are skipped.
    source_files = {
        node.metadata['file_name']
        for node in result.source_nodes
        if 'file_name' in node.metadata
    }

    if not source_files:
        return answer

    sources_text = "\n\nSources:\n- " + "\n- ".join(sorted(source_files))
    return f"{answer}{sources_text}"

tools = [
    Tool(
        name="Document Search",
        func=query_and_format_results,
        description="Useful for finding and answering questions about the information contained in the uploaded documents. Provides the answer and the source documents.",
    ),
]

# 3. Set up memory for the chatbot.
#    The conversational ReAct agent uses a plain string prompt, so the history
#    is kept as text rather than as a list of message objects.
memory = ConversationBufferMemory(memory_key="chat_history")

# 4. Wrap the LlamaIndex LLM for use in LangChain
langchain_llm = LlamaIndexLLM(llm=llm)

# 5. DEFINE THE UPDATED SYSTEM PROMPT
system_prompt_content = """You are a helpful and friendly AI assistant named 'DocuBot'.

Your primary purpose is to answer questions based on the information available in the documents provided through the 'Document Search' tool.

Follow these rules strictly:
1.  Be polite and conversational in your responses.
2.  Before answering, you must use the 'Document Search' tool to find relevant information.
3.  Base your answers *only* on the information retrieved from the tool. Do not use any of your internal knowledge.
4.  If the documents do not contain the answer, state that the information is not available.
5.  **Crucially, the 'Document Search' tool will provide a 'Sources' section in its output. You MUST include this 'Sources' section, verbatim and unaltered, at the very end of your final response.**
6.  Answer always in the same language of the question.
"""

# 6. Set up agent_kwargs.
#    "conversational-react-description" has no `system_message` option; its
#    instructions are customised through `prefix`, which is placed directly
#    before the generated tool list, hence the "TOOLS" lead-in.
agent_kwargs = {
    "prefix": system_prompt_content
    + "\nTOOLS:\n------\n\nYou have access to the following tools:"
}

# 7. Initialize the LangChain agent with the custom prompt prefix
agent_chain = initialize_agent(
    tools,
    langchain_llm,
    agent="conversational-react-description",
    memory=memory,
    verbose=True,
    agent_kwargs=agent_kwargs,
)

# Gradio Interface

In [None]:
import gradio as gr

def chatbot_response(message, history):
    """Gradio chat callback: forward the user's message to the LangChain agent.

    Args:
        message: Text the user just typed.
        history: Chat history supplied by Gradio; not used here.

    Returns:
        Whatever `agent_chain.run` returns for this message.
    """
    return agent_chain.run(input=message)

# Chat UI backed by `chatbot_response`.
interface = gr.ChatInterface(
    title="R&D Projects Chatbot",
    description="Ask questions about the R&D projects documents.",
    theme="soft",
    fn=chatbot_response,
)

# share=True requests a public link; debug=True keeps the cell attached to the server.
interface.launch(debug=True, share=True)