<a href="https://colab.research.google.com/github/msquareddd/ai-engineering-notebooks/blob/main/RAG/test_rag_docling_llamaindex_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG with Docling/LlamaIndex/LangChain

# Dependencies

In [None]:
%pip install -q --progress-bar off --no-warn-conflicts llama-index-llms-vllm llama-index-core llama-index-readers-docling llama-index-node-parser-docling llama-index-embeddings-huggingface llama-index-readers-file python-dotenv transformers bitsandbytes accelerate faiss-cpu llama-index-vector-stores-faiss vllm llama-index-embeddings-vllm huggingface_hub

In [None]:
# Agent/UI stack: LangChain (plus community integrations, used for the vLLM wrapper),
# Gradio for the chat interface, and the LlamaIndex package for LangChain LLMs
# (presumably what lets the LangChain `VLLM` object be passed as `llm=` to LlamaIndex — confirm).
%pip install -q langchain gradio langchain-community llama-index-llms-langchain

In [None]:
import os
from pathlib import Path
from warnings import filterwarnings
from dotenv import load_dotenv
import faiss

filterwarnings(action="ignore", category=UserWarning, module="pydantic")
filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
# # https://github.com/huggingface/transformers/issues/5486:
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# os.environ["HF_HOME"] = "model/"

# Hugging Face Login

In [None]:
from huggingface_hub import login
from google.colab import userdata, files

# Set up LangSmith (optional).
# `userdata.get` raises when the secret does not exist or is not shared with this
# notebook, so tracing is enabled only when the key can actually be read; otherwise
# the cell carries on to the Hugging Face login instead of aborting.
try:
    langsmith_api_key = userdata.get('LANGSMITH_API_KEY')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    langsmith_api_key = None

if langsmith_api_key:
    os.environ["LANGSMITH_TRACING"] = "true"
    os.environ["LANGSMITH_API_KEY"] = langsmith_api_key
    os.environ["LANGSMITH_PROJECT"] = "Test RAG Docling/LlamaIndex"
else:
    print("LANGSMITH_API_KEY secret not available; LangSmith tracing is disabled.")

# HuggingFace login with the token stored in the Colab secret `HF_TOKEN`.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

# Data Folder Creation

In [None]:
# Folder used for the notebook's data; it is created only when it is not there yet.
base_path = "/content/data"

if os.path.exists(base_path):
    print(f"Directory '{base_path}' already exists.")
else:
    os.makedirs(base_path)
    print(f"Directory '{base_path}' created successfully.")

# Main Parameters

In [None]:
# Central configuration read by the cells below.

# Model id loaded by the Hugging Face (`HuggingFaceLLM`) backend.
# Previously tried alternatives: "mistralai/Magistral-Small-2509",
# "nvidia/NVIDIA-Nemotron-Nano-9B-v2", "nemotron-mini:4b".
MODEL = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# Model id loaded by the vLLM backend.
# Previously tried alternatives: "Qwen/Qwen2.5-14B-Thinking-AWQ", "Qwen/Qwen2.5-14B-Instruct".
VLLM_MODEL = "Qwen/Qwen2.5-14B-Instruct-AWQ"
# Embedding model id; previously tried alternative: "BAAI/bge-small-en-v1.5".
EMBED_MODEL = "google/embeddinggemma-300m"
# File the FAISS index is written to by `save_faiss_index`.
DB_PATH = "./data/faiss_index.bin"
# Generation length limit passed to both LLM backends.
MAX_NEW_TOKENS = 2048
# Value passed as the vLLM `quantization` option.
QUANT = "awq_marlin"

# Using Hugging Face

In [None]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute dtype).
quant_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Keyword arguments handed to the model loader and to generation, respectively.
hf_model_kwargs = {"quantization_config": quant_config_4bit, "dtype": torch.bfloat16}
hf_generate_kwargs = {"temperature": 0.7, "do_sample": True, "top_p": 0.95}

# LlamaIndex LLM backed by transformers; model and tokenizer share the same id.
llm = HuggingFaceLLM(
    model_name=MODEL,
    tokenizer_name=MODEL,
    context_window=8000,
    max_new_tokens=MAX_NEW_TOKENS,
    model_kwargs=hf_model_kwargs,
    generate_kwargs=hf_generate_kwargs,
    device_map="auto",
)

# Using vLLM

In [None]:
from langchain_community.llms import VLLM

# Options passed to the LangChain wrapper as `vllm_kwargs`.
vllm_engine_options = {
    "quantization": QUANT,
    "swap_space": 1,
    "gpu_memory_utilization": 0.8,
    "max_model_len": 4096,
}

# Rebinds `llm`: when this cell runs after the Hugging Face one, later cells use vLLM.
llm = VLLM(
    model=VLLM_MODEL,
    trust_remote_code=True,  # mandatory for hf models
    max_new_tokens=MAX_NEW_TOKENS,
    top_k=10,
    top_p=0.95,
    temperature=0.8,
    tensor_parallel_size=1,
    vllm_kwargs=vllm_engine_options,
)


# Embeddings

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model shared by every index built below (passed as `embed_model`).
embeddings = HuggingFaceEmbedding(model_name=EMBED_MODEL)

In [None]:
# Document used for the single-file test, read from the Colab secret `DOC_PATH`
# (presumably a file path or URL that DoclingReader can load — confirm).
SOURCE = userdata.get('DOC_PATH')

print(f"Source document: {SOURCE}")

# Embed a throwaway string to discover the vector length; it sizes the FAISS index.
embed_dim = len(embeddings.get_text_embedding("hi"))

In [None]:
# Empty flat L2 FAISS index whose dimension matches the embedding model's output.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index created successfully!")

In [None]:
def save_faiss_index(index, path):
    """Write a FAISS index to disk, creating the destination folder if needed.

    Args:
        index: FAISS index object to serialize.
        path: Destination file, given as ``str`` or ``os.PathLike``.
    """
    # Normalize to a plain string so pathlib.Path values are accepted as well.
    path = os.fspath(path)
    # Make sure the parent folder exists; a bare file name has no parent to create.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    faiss.write_index(index, path)
    print(f"FAISS index saved to {path}")

In [None]:
def check_faiss_index_status(index):
    """Print how many vectors the FAISS index holds and its dimension.

    Reads ``index.ntotal`` and ``index.d``; nothing is returned.
    """
    vector_count = index.ntotal
    print(f"FAISS index contains {vector_count} vectors")
    dimension = index.d
    print(f"Index dimension: {dimension}")

# Single file extraction for Testing

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore

# Loader and node parser for the single-file test.
reader = DoclingReader()
node_parser = MarkdownNodeParser()

# Wrap the FAISS index created earlier so LlamaIndex can store vectors in it.
vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

# Load the document first, then build the index on top of the FAISS-backed storage.
single_file_docs = reader.load_data(SOURCE)
single_file_storage = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents=single_file_docs,
    transformations=[node_parser],
    storage_context=single_file_storage,
    embed_model=embeddings,
)

# Persist the raw FAISS index and report its size.
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

In [None]:
# Smoke-test question for the single-document index.
QUERY = "Che risultati ha dato l'analisi metallografica fatta nel progetto Marelli?"

# Retrieve the 5 most similar nodes and let the LLM synthesize the answer.
single_doc_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = single_doc_engine.query(QUERY)

print("Query executed successfully!\n\n")

answer_text = result.response.strip()
print(f"Q: {QUERY}\nA: {answer_text}\n\nSources:")
# Show the text and metadata of every node the answer was based on.
source_pairs = [(node.text, node.metadata) for node in result.source_nodes]
display(source_pairs)

# Directory Reader

In [None]:
# Folder scanned by SimpleDirectoryReader; same location as `base_path` created earlier.
DIR_PATH = "/content/data"

In [None]:
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import SimpleDirectoryReader

# Start from a fresh FAISS index so this run holds only the directory's vectors.
faiss_index = faiss.IndexFlatL2(embed_dim)
print("Index reset successfully!")

vector_store = FaissVectorStore(faiss_index=faiss_index)
print("Vector store created successfully!")

reader = DoclingReader(export_type=DoclingReader.ExportType.MARKDOWN)
node_parser = MarkdownNodeParser()

# Office/PDF formats are routed through Docling.
docling_extractors = {ext: reader for ext in (".pdf", ".docx", ".pptx", ".xlsx")}

# `save_faiss_index` writes the index dump to DB_PATH ("./data/..."), which is this same
# folder when the working directory is /content. Exclude that file by name so the binary
# dump from a previous cell/run is not ingested as if it were a document.
dir_reader = SimpleDirectoryReader(
    input_dir=DIR_PATH,
    file_extractor=docling_extractors,
    exclude=[os.path.basename(DB_PATH)],
)

index = VectorStoreIndex.from_documents(
    documents=dir_reader.load_data(),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embeddings,
)

print("Index created successfully!")
save_faiss_index(faiss_index, DB_PATH)
check_faiss_index_status(faiss_index)

## Query Test on Directory

In [None]:
# Test question for the directory-wide index; kept in its own cell so it can be
# edited and re-run without touching the query cell below.
QUERY = "Can you summarize the test results in the ZF basekit project?"

In [None]:
# Query the current index with the top-5 most similar nodes as context.
directory_engine = index.as_query_engine(llm=llm, similarity_top_k=5)
result = directory_engine.query(QUERY)

answer_text = result.response.strip()
print(f"Q: {QUERY}\nA: {answer_text}\n\nSources:")
# Show the text and metadata of every node the answer was based on.
display([(node.text, node.metadata) for node in result.source_nodes])

# System Prompts

In [None]:
from langchain_core.prompts import PromptTemplate

# First (compact) ReAct-style prompt. It exposes the placeholders {tools}, {input},
# {tool_names} and {agent_scratchpad}.
# NOTE: the next cell assigns `template` again, so when the notebook is run top to
# bottom this version is overwritten before the agent is built.
template = """You are a helpful and friendly AI assistant.

Follow these rules strictly:
1.  Be polite and conversational in your responses.
2.  Before answering, you must use the 'Document Search' tool to find relevant information.
3.  Base your answers *only* on the information retrieved from the tool. Do not use any of your internal knowledge.
4.  If the documents do not contain the answer, state that the information is not available.
5.  **Crucially, the 'Document Search' tool will provide a 'Sources' section in its output. You MUST include this 'Sources' section, verbatim and unaltered, at the very end of your final response.**
6.  Answer always in the same language of the question.

TOOLS:
------
You have access to the following tools:

{tools}

Use the following format for reasoning:

Question: {input}
Thought: you think about what to do
Action: one of [{tool_names}]
Action Input: the input for the action
Observation: result of the action
... (you can repeat Thought/Action/Action Input/Observation)
Thought: I now know the final answer
Final Answer: the answer to the original question

Begin!

{agent_scratchpad}

"""

In [None]:
# Detailed ReAct-style agent prompt. Single-brace names ({tools}, {input}, {tool_names},
# {agent_scratchpad}) are template variables; double-brace spans render as literal
# "{...}" hints for the model. Emoji and arrows are stored as proper Unicode characters.
template = """
# 🧠 System Prompt (Optimized for LangChain Agent)

You are a helpful, polite, and conversational AI assistant that answers user questions **only** using information retrieved through the `Document Search` tool.

---

## ✅ Rules (Follow Strictly)

1. **Always** call the `Document Search` tool before producing a final answer.
   You may call it **multiple times** with different search queries.
2. Base your **final answer only** on information retrieved from the tool.
   Do **not** use internal knowledge or fabricate information.
3. If the documents do **not** contain the answer, say:
   **"The requested information is not available in the searched documents."**
4. Respond in the **same language** as the user's question.
5. The `Document Search` tool returns a **Sources** section.
   You **must** include that **Sources** section **verbatim and unaltered** at the **end** of your final answer.
6. Never reveal chain-of-thought.
   Only output structured action logs using the format below.

---

## 🔧 Tools Available

You have access to the following tools:

```
{tools}
```

---

## 🧩 Reasoning & Action Format

Use this exact structure to perform tool calls and produce your final answer:

```
Question: {input}

Thought: you think about what to do
Action: one of [{tool_names}]
Action Input: {{search query}}
Observation: {{result}}

# You may repeat Thought → Action → Action Input → Observation as needed.

Thought: I now know the final answer
Final Answer: {{answer strictly based on observations}}

Sources:
{{verbatim Sources section from the final tool output}}
```

- You are allowed and encouraged to run **multiple search queries**, trying different variations, synonyms, keyword versions, broader/narrower phrasing, etc.
- Do **not** modify the `Sources` text.

---

## 🔍 Query Variation Guidance

When performing searches, you may generate variations of the user query such as:

- The exact user question
- Synonym-expanded versions
- Rephrased or keyword-based versions
- Broader or narrower versions
- Versions with or without filters/qualifiers

Stop once you have found enough information to answer.

---

## 🚫 Failure Handling

If the tool returns an error or no data, say:

**"Document Search failed: {{error message}}"**
or
**"The requested information is not available in the searched documents."**

Do **not** answer using internal knowledge.

---

## ▶️ Begin

{agent_scratchpad}
"""

# LangChain Set-up

In [None]:
from langchain_core.messages import SystemMessage
from langchain.tools import Tool
from langchain.memory import ConversationBufferMemory
from langchain.agents import AgentExecutor, create_react_agent


# Query engine shared by the agent's search tool: uses the current `index` and `llm`,
# retrieving the 5 most similar nodes per query.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

def query_and_format_results(query):
    """Run ``query`` through the module-level ``query_engine`` and append its sources.

    Args:
        query: Question text forwarded to ``query_engine.query``.

    Returns:
        The stripped answer text. When at least one source node carries a non-empty
        ``file_name`` in its metadata, a blank line and a ``Sources:`` bullet list of
        the distinct file names (sorted) are appended.
    """
    result = query_engine.query(query)
    # Guard against a missing (None) response instead of failing on `.strip()`.
    answer = (result.response or "").strip()

    # Keep only real names: a None/empty `file_name` would break sorted() and join().
    candidate_names = (node.metadata.get("file_name") for node in result.source_nodes)
    source_files = sorted({name for name in candidate_names if name})
    if source_files:
        return answer + "\n\nSources:\n- " + "\n- ".join(source_files)
    return answer

# The single tool exposed to the agent: document retrieval plus answer synthesis.
tools = [
    Tool(
        name="Document Search",
        func=query_and_format_results,
        description="Use this to answer questions using the uploaded documents.",
    )
]

# Conversation history is stored under "chat_history".
# NOTE(review): neither prompt template in this notebook has a {chat_history}
# placeholder, so the stored history does not appear to reach the prompt — confirm.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# The prompt only needs the template text and its variables; the tool list itself is
# handed to create_react_agent / AgentExecutor below, not to PromptTemplate.
prompt = PromptTemplate(
    input_variables=["input", "tools", "tool_names", "agent_scratchpad"],
    template=template,
)

react_agent = create_react_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
)

# Executor that runs the agent loop; parsing errors are handled rather than raised.
agent_chain = AgentExecutor(
    agent=react_agent,
    tools=tools,
    memory=memory,
    verbose=True,
    handle_parsing_errors=True,
)

# Gradio Interface

In [None]:
import gradio as gr


def chatbot_response(message, history):
    """Forward the newest user message to the agent and return its reply.

    ``history`` is accepted to match the chat callback signature but is not used.
    The reply is taken from the ``"output"`` key, then ``"output_text"``; if neither
    is present, the string form of the whole result is returned.
    """
    agent_result = agent_chain.invoke({"input": message})
    if "output" in agent_result:
        return agent_result["output"]
    if "output_text" in agent_result:
        return agent_result["output_text"]
    return str(agent_result)


# Chat UI wired to the agent callback. Other theme names noted by the author:
# soft, glass, default.
chat_ui_options = dict(
    fn=chatbot_response,
    title="R&D Projects Chatbot",
    description="Ask questions about the R&D projects documents.",
    theme="ocean",
)
interface = gr.ChatInterface(**chat_ui_options)

# NOTE(review): share=True asks Gradio for a publicly reachable link — confirm that
# exposing the document chatbot this way is intended.
interface.launch(share=True, debug=True)