In [None]:
from db.pgvector_client import PgVectorIndexer
from data_ingestion.docling_llama_ingestor import DoclingLlamaIngestor
from core.config import settings
import os
import sys
from llama_index.node_parser.docling import DoclingNodeParser
# --- Import your other class and splitter ---
from llama_index.core.text_splitter import SentenceSplitter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker


In [35]:
# Path to your large test file
test_file = r"C:\Work\RAG-Agent-Project\Documents To Ingest\Abu Dhabi Procurement Standards.PDF"

try:
    
    ingestor = DoclingLlamaIngestor()
    chunker = HybridChunker(
        chunk_size=1024,
        chunk_overlap=128
    )
    nodes = ingestor.parse_with_docling_node_parser(
        test_file,
        node_parser=DoclingNodeParser(chunker=chunker))
    print(f"Ingested {len(nodes)} nodes from document.")

except Exception as e:
    print(f"Failed to ingest document: {e}", file=sys.stderr)
    sys.exit(1)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
2025-10-25 23:07:11,935 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-25 23:07:11,987 - INFO - Going to convert document batch...
2025-10-25 23:07:11,994 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 9979fc6e2a83cb9fddbaf66b8dbf3dbb
2025-10-25 23:07:12,178 - INFO - command: C:\Program Files\Tesseract-OCR\tesseract.exe --list-langs
2025-10-25 23:07:12,299 - INFO - Accelerator device: 'cpu'
2025-10-25 23:07:13,689 - INFO - Accelerator device: 'cpu'
2025-10-25 23:07:14,917 - INFO - Processing document Abu Dhabi Procurement Standards.PDF
2025-10-25 23:07:15,478 - INFO - command: C:\Program Files\Tesseract-OCR\tesseract.exe --psm 0 -l osd C:\Users\mentz\AppData\Local\Temp\tmp3ol6mweq.png stdo

Ingested 159 nodes from document.


In [36]:
a = nodes[:10]
a

[TextNode(id_='500a2ffb-77db-474b-ada3-ce112ce71eee', embedding=None, metadata={'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/10', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 170.0, 't': 84.25337727864587, 'r': 195.0, 'b': 78.9200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 7]}]}, {'self_ref': '#/texts/11', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 197.33333333333334, 't': 84.25337727864587, 'r': 209.0, 'b': 78.9200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 3]}]}, {'self_ref': '#/texts/12', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 212.0, 't': 84.58671061197913, 'r': 224.33333333333334, 'b': 78.9200439453125, 'coord_origin': 'BOTTOMLEFT'}, 'chars

In [37]:
"""
pgvector_indexer.py

This class connects to the DoclingLlamaIngestor.
It takes the nodes from the ingestor and:
 1. Connects to a PostgreSQL database with PGVector.
 2. Initializes an embedding model (Ollama).
 3. Initializes a PGVectorStore, which creates the table (collection) if it doesn't exist.
 4. Provides methods to insert nodes, delete documents, and drop the collection.
 5. Creates an HNSW index for fast, cosine-distance search.
 6. Provides an advanced query method with filtering and reranking.

Install required packages:
    pip install llama-index llama-index-vector-stores-postgres llama-index-embeddings-ollama sqlalchemy psycopg2-binary
    pip install llama-index-postprocessor-sentence-transformers sentence-transformers
"""

import sys
from typing import List, Optional, Dict, Any
from urllib.parse import quote_plus

# Database and SQL
import sqlalchemy
from sqlalchemy import make_url, text, inspect

# LlamaIndex components
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Settings,  # Updated from ServiceContext
)
from llama_index.core.schema import BaseNode
from llama_index.core.vector_stores import (
    MetadataFilters,
    ExactMatchFilter,
)
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core.postprocessor import SentenceTransformerRerank

# Assuming the ingestor is in a parent 'data_ingestion' folder
# If it's in the same folder, you might use:
# from docling_llama_ingestor import DoclingLlamaIngestor
try:
    from data_ingestion.docling_llama_ingestor import DoclingLlamaIngestor
except ImportError:
    print("Warning: Could not import DoclingLlamaIngestor. "
          "Make sure it's in the correct path (e.g., data_ingestion/docling_llama_ingestor.py)", 
          file=sys.stderr)


class PgVectorIndexer:
    """
    Manages embedding nodes and indexing them in PGVector with
    advanced features like HNSW, filtering, and reranking.
    """

    def __init__(
        self,
        db_name: str,
        db_user: str,
        db_pass: str,
        db_host: str,
        db_port: int,
        collection_name: str,
        embed_model_name: str = "nomic-embed-text",
        ollama_base_url: str = "http://localhost:11434",
        reranker_model: str = "cross-encoder/ms-marco-minilm-l-6-v2",
    ):
        """
        Initializes the connection, embedding model, and vector store.
        """
        print(f"Initializing PgVectorIndexer for collection: '{collection_name}'")
        self.collection_name = collection_name

        # 1. Build the database connection string
        self.connection_string = (
            f"postgresql+psycopg2://{db_user}:{quote_plus(db_pass)}"
            f"@{db_host}:{db_port}/{db_name}"
        )
        # Create a persistent engine and inspector
        self.engine = sqlalchemy.create_engine(self.connection_string)
        self.inspector = inspect(self.engine)

        # 2. Initialize the Embedding Model
        print(f"Initializing embedding model: {embed_model_name}")
        self.embed_model = OllamaEmbedding(
            model_name=embed_model_name,
            base_url=ollama_base_url,
        )
        
        # --- Modernized LlamaIndex Setup (using Settings) ---
        # This sets the *global* embedding model for LlamaIndex
        Settings.embed_model = self.embed_model
        Settings.llm = None
        # --- NOTE ---
        # We are *not* setting Settings.llm here.
        # This means queries will fail *unless* synthesize_answer=False
        # is used in the query() method.
        
        # 3. Initialize the Reranker
        print(f"Initializing reranker: {reranker_model}")
        # This model is small and fast.
        self.reranker = SentenceTransformerRerank(
            model=reranker_model, top_n=3
        )

        # 4. Initialize the Vector Store
        print(f"Connecting to vector store and initializing collection '{collection_name}'...")
        embed_dim = self._get_embed_dim()
        
        self.vector_store = PGVectorStore.from_params(
            database=db_name,
            user=db_user,
            password=db_pass,
            host=db_host,
            port=db_port,
            table_name=collection_name,
            embed_dim=embed_dim,
            # HNSW settings for cosine distance
            hnsw_kwargs={
                "hnsw_m": 32,
                "hnsw_ef_construction": 128,
                "hnsw_ef_search": 64,
                "hnsw_dist_method": "vector_cosine_ops",
            },
        )

        # 5. Initialize Storage and Load Index
        self.storage_context = StorageContext.from_defaults(
            vector_store=self.vector_store
        )
        
        print("Loading index from vector store...")
        self.index = VectorStoreIndex.from_vector_store(
            vector_store=self.vector_store,
        )

    def _get_embed_dim(self) -> int:
        """Helper to get embedding dimension from the Ollama model."""
        try:
            test_embedding = self.embed_model.get_text_embedding("test")
            dim = len(test_embedding)
            print(f"Detected embedding dimension: {dim}")
            return dim
        except Exception as e:
            print(f"Error getting embedding dimension: {e}", file=sys.stderr)
            print("Defaulting to 768.", file=sys.stderr)
            return 768

    def is_collection_empty(self) -> bool:
        """Checks if the vector store's table is empty."""
        try:
            if not self.inspector.has_table(self.collection_name):
                print(f"Table '{self.collection_name}' does not exist yet.")
                return True
                
            with self.engine.connect() as connection:
                query = text(f'SELECT COUNT(*) FROM "{self.collection_name}"')
                result = connection.execute(query)
                count = result.scalar()
                print(f"Found {count} existing items in collection.")
                return count == 0
        except Exception as e:
            print(f"Error checking collection count: {e}", file=sys.stderr)
            return False

    def insert_nodes(
        self, nodes: List[BaseNode], force_reinsert: bool = False, create_hnsw: bool = True
    ):
        """
        Inserts nodes into the index, only if the collection is empty
        or if 'force_reinsert' is True.

        Args:
            nodes: The list of nodes from DoclingLlamaIngestor.
            force_reinsert: If True, inserts nodes even if the collection has data.
            create_hnsw: If True, attempts to create an HNSW index after inserting.
        """
        if not nodes:
            print("No nodes provided to index.", file=sys.stderr)
            return

        if force_reinsert or self.is_collection_empty():
            print("Collection is empty or re-insert forced. Inserting nodes...")
            # We use insert_nodes on the index object.
            self.index.insert_nodes(nodes)
            print(f"Successfully inserted {len(nodes)} nodes.")
            
            if create_hnsw:
                self.create_hnsw_index()
        else:
            print(
                "Collection already contains data and force_reinsert=False. "
                "Index is loaded and ready to query."
            )

    def create_hnsw_index(self):
        """
        Creates an HNSW index on the embedding column for fast cosine similarity search.
        This is much faster than flat indexing for large datasets.
        """
        index_name = f"hnsw_idx_{self.collection_name}"
        
        # Check if index already exists
        with self.engine.connect() as connection:
            check_idx_query = text(
                f"SELECT 1 FROM pg_indexes WHERE "
                f"tablename = '{self.collection_name}' AND "
                f"indexname = '{index_name}'"
            )
            index_exists = connection.execute(check_idx_query).scalar()

        if index_exists:
            print(f"HNSW index '{index_name}' already exists.")
            return

        print(f"Creating HNSW index '{index_name}'... This may take a few minutes.")
        
        # This command creates an HNSW index for cosine similarity (<=>)
        # HNSW is only available in pgvector >= 0.5.0
        # For older versions, you might use 'ivfflat'
        sql_command = text(
            f'CREATE INDEX {index_name} ON "{self.collection_name}" '
            f'USING hnsw (embedding vector_cosine_ops);'
        )
        
        try:
            with self.engine.connect() as connection:
                connection.execution_options(isolation_level="AUTOCOMMIT").execute(
                    sql_command
                )
            print("HNSW index created successfully.")
        except Exception as e:
            print(f"Error creating HNSW index: {e}", file=sys.stderr)
            print("Please ensure you have pgvector >= 0.5.0 installed in PostgreSQL.")

    def drop_collection(self):
        """
        Drops the entire collection (table) from the database.
        This is a destructive operation!
        """
        print(f"WARNING: Dropping collection '{self.collection_name}'...")
        try:
            with self.engine.connect() as connection:
                sql = text(f'DROP TABLE IF EXISTS "{self.collection_name}";')
                connection.execution_options(isolation_level="AUTOCOMMIT").execute(sql)
            print(f"Collection '{self.collection_name}' dropped successfully.")
        except Exception as e:
            print(f"Error dropping collection: {e}", file=sys.stderr)

    def delete_document(self, file_path: str):
        """
        Deletes all nodes associated with a specific file_path (ref_doc_id).
        
        Args:
            file_path: The 'source_file' metadata value of the document to delete.
        """
        print(f"Deleting all nodes for document: {file_path}")
        try:
            # LlamaIndex abstracts this away nicely
            self.index.delete_ref_doc(file_path, delete_from_docstore=True)
            print(f"Successfully deleted nodes for {file_path}.")
        except Exception as e:
            print(f"Error deleting document {file_path}: {e}", file=sys.stderr)

    def query(
        self,
        query_text: str,
        similarity_top_k: int = 5,
        filters: Optional[Dict[str, Any]] = None,
        rerank: bool = False,
        rerank_top_n: int = 3,
        synthesize_answer: bool = True,  # <-- MODIFIED: Added new parameter
    ):
        """
        Queries the loaded index with optional filtering and reranking.

        Args:
            query_text: The question to ask.
            similarity_top_k: The number of nodes to retrieve from the vector store.
            filters: A dictionary of exact-match metadata filters 
                     (e.g., {"page_label": "5", "element_type": "table"}).
            rerank: If True, use a reranker model on the retrieved nodes.
            rerank_top_n: The final number of nodes to return after reranking.
            synthesize_answer: If False, skips LLM call and only returns
                               retrieved/reranked source nodes. # <-- MODIFIED: Added docstring
        """
        if self.index is None:
            print("Index is not initialized.", file=sys.stderr)
            return None

        # 1. Build Metadata Filters
        llama_filters = None
        if filters:
            print(f"Applying filters: {filters}")
            filter_list = [
                ExactMatchFilter(key=k, value=v) for k, v in filters.items()
            ]
            llama_filters = MetadataFilters(filters=filter_list, condition="AND")

        # 2. Configure Reranker (if used)
        postprocessors = []
        if rerank:
            print(f"Reranking enabled (top_n={rerank_top_n})")
            self.reranker.top_n = rerank_top_n
            postprocessors.append(self.reranker)

        # 3. Build the Query Engine
        
        # <-- MODIFIED: Added response_mode logic -->
        # Determine the response mode. 
        # "no_text" skips the LLM synthesis step completely.
        response_mode = "compact" if synthesize_answer else "no_text"
        
        print(f"Building query engine with response_mode='{response_mode}'")
        # <-- END MODIFIED SECTION -->
        
        query_engine = self.index.as_query_engine(
            similarity_top_k=similarity_top_k,
            filters=llama_filters,
            node_postprocessors=postprocessors,
            response_mode=response_mode,  # <-- MODIFIED: Added response_mode
        )

        print(f"\n--- Querying Index ---")
        print(f"Query: {query_text}")
        response = query_engine.query(query_text)
        return response

In [None]:

# -----------------------------------------------------------------
# Example of how to use this class with your DoclingLlamaIngestor
# -----------------------------------------------------------------

if __name__ == "__main__":
   
    # --- 1. Database Configuration ---
    # !!! IMPORTANT: Update these with your PostgreSQL details !!!
    DB_CONFIG = {
        "db_host": "localhost",  # or your actual host
        "db_port": settings.DB_PORT,
        "db_user": settings.DB_USER,
        "db_pass": settings.DB_PASSWORD,
        "db_name": settings.DB_NAME,
    }
    # This will be the name of the table in your database
    COLLECTION_NAME = "doc_standards_hnsw"   

    # --- 3. Initialize Indexer ---
    print("\n--- Step 2: Initializing PGVector Indexer ---")
    try:
        indexer = PgVectorIndexer(
            **DB_CONFIG,
            collection_name=COLLECTION_NAME,
            embed_model_name="settings.EMBEDDING_MODEL",
        )
    except Exception as e:
        print(f"Failed to initialize indexer. Is PostgreSQL running? {e}", file=sys.stderr)
        sys.exit(1)

    # --- 4. Insert Nodes into Database ---
    print("\n--- Step 3: Inserting Nodes into PGVector ---")
    # Set force_reinsert=True if you want to re-upload the data
    indexer.insert_nodes(a, force_reinsert=False, create_hnsw=True)

    


--- Step 2: Initializing PGVector Indexer ---
Initializing PgVectorIndexer for collection: 'doc_standards_hnsw'
Initializing embedding model: embeddinggemma
LLM is explicitly disabled. Using MockLLM.
Initializing reranker: cross-encoder/ms-marco-minilm-l-6-v2
Connecting to vector store and initializing collection 'doc_standards_hnsw'...


2025-10-25 23:15:01,194 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"


Detected embedding dimension: 768
Loading index from vector store...

--- Step 3: Inserting Nodes into PGVector ---
Table 'doc_standards_hnsw' does not exist yet.
Collection is empty or re-insert forced. Inserting nodes...


2025-10-25 23:15:04,202 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:09,940 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:12,026 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:14,694 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:15,840 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:19,331 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:22,213 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:25,235 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:28,579 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-10-25 23:15:29,639 - IN

Successfully inserted 10 nodes.
Creating HNSW index 'hnsw_idx_doc_standards_hnsw'... This may take a few minutes.
Please ensure you have pgvector >= 0.5.0 installed in PostgreSQL.


Error creating HNSW index: (psycopg2.errors.UndefinedTable) relation "doc_standards_hnsw" does not exist

[SQL: CREATE INDEX hnsw_idx_doc_standards_hnsw ON "doc_standards_hnsw" USING hnsw (embedding vector_cosine_ops);]
(Background on this error at: https://sqlalche.me/e/20/f405)


In [39]:
# Add this to the end of your pgvector_indexer.py's __main__ block

print("\n--- Step 4: Running a Test Query ---")

# We set synthesize_answer=False so it only returns the source nodes
# This way, we don't need to have Settings.llm configured.
response = indexer.query(
    query_text="What are the procurement standards?",
    similarity_top_k=3,
    rerank=False,
    synthesize_answer=False  # <-- This just gets the nodes
)

if response.source_nodes:
    print(f"\nSuccessfully retrieved {len(response.source_nodes)} source nodes:")
    
    for i, node_with_score in enumerate(response.source_nodes):
        node = node_with_score.node
        print(f"\n--- Node {i+1} (Score: {node_with_score.score:.4f}) ---")
        
        # This will print the HUMAN-READABLE text
        print(f"Text: {node.get_text()[:300]}...") 
        
        # This will print the rich metadata, including page_label!
        print(f"Metadata: {node.metadata}") 
else:
    print("Query returned no results.")


--- Step 4: Running a Test Query ---
Building query engine with response_mode='no_text'

--- Querying Index ---
Query: What are the procurement standards?


2025-10-25 23:15:30,154 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"



Successfully retrieved 3 source nodes:

--- Node 1 (Score: 0.6304) ---
Text: 1. The Procurement Practitioners must act in the best interest of the government entity they represent and ensure required goods, services and projects are procured providing expected value to the government entity.
2. The Procurement Practitioners must carry out the duties and responsibilities assi...
Metadata: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/189', 'parent': {'$ref': '#/groups/3'}, 'children': [], 'content_layer': 'body', 'label': 'list_item', 'prov': [{'page_no': 8, 'bbox': {'l': 76.0, 't': 710.5867106119791, 'r': 538.6666666666666, 'b': 668.5867106119791, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 215]}]}, {'self_ref': '#/texts/190', 'parent': {'$ref': '#/groups/3'}, 'children': [], 'content_layer': 'body', 'label': 'list_item', 'prov': [{'page_no': 8, 'bbox': {'l': 75.66666666666667, 't': 655.2533772786459, 'r': 538.0, 

In [None]:

# Example 2: Query with Metadata Filtering
print("\n--- Query (Filtered) ---")
response_filtered = indexer.query(
    "What does the standard say about 'framework agreements'?",
    similarity_top_k=5,
    filters={"page_label": "21"}, # Look *only* on page 21
    rerank=False
)
print("\n--- Answer (Filtered to Page 21) ---")
if response_filtered.source_nodes:
    print(response_filtered)
    for node in response_filtered.source_nodes:
        print(f"  [Source Page: {node.metadata.get('page_label', 'N/A')}]")
else:
    print("No results found on page 21 for that query.")

# Example 3: Query with Reranking
print("\n--- Query (Reranked) ---")
response_reranked = indexer.query(
    "Explain the process for supplier pre-qualification",
    similarity_top_k=10,  # Retrieve 10 candidates
    rerank=True,
    rerank_top_n=3        # Return only the best 3
)
print("\n--- Answer (Reranked, Top 3) ---")
print(response_reranked)
print("Source nodes (reranked):")
for node in response_reranked.source_nodes:
    print(f"  [Source Page: {node.metadata.get('page_label', 'N/A')}, Score: {node.score:.4f}]")



In [1]:
from db.pgvector_client import PgVectorIndexer
from data_ingestion.docling_llama_ingestor import DoclingLlamaIngestor
from core.config import settings
import os
import sys
from llama_index.node_parser.docling import DoclingNodeParser
# --- Import your other class and splitter ---
from llama_index.core.text_splitter import SentenceSplitter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker


  from .autonotebook import tqdm as notebook_tqdm


Loading config: DB_HOST=localhost, OLLAMA_URL=http://localhost:11434


In [2]:
# Path to your large test file
test_file = r"C:\Work\RAG-Agent-Project\Documents To Ingest\Abu Dhabi Procurement Standards.PDF"

try:
    
    ingestor = DoclingLlamaIngestor()
    chunker = HybridChunker(
        chunk_size=1024,
        chunk_overlap=128
    )
    nodes = ingestor.parse_with_docling_node_parser(
        test_file,
        node_parser=DoclingNodeParser(chunker=chunker))
    print(f"Ingested {len(nodes)} nodes from document.")

except Exception as e:
    print(f"Failed to ingest document: {e}", file=sys.stderr)
    sys.exit(1)

a = nodes[:10]

2025-10-25 23:43:23,190 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-25 23:43:25,145 - INFO - Going to convert document batch...
2025-10-25 23:43:25,150 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 9979fc6e2a83cb9fddbaf66b8dbf3dbb
2025-10-25 23:43:25,225 - INFO - Loading plugin 'docling_defaults'
2025-10-25 23:43:25,244 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-10-25 23:43:25,271 - INFO - Loading plugin 'docling_defaults'
2025-10-25 23:43:25,311 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-10-25 23:43:25,389 - INFO - command: C:\Program Files\Tesseract-OCR\tesseract.exe --list-langs
2025-10-25 23:43:25,488 - INFO - Accelerator device: 'cpu'
2025-10-25 23:43:27,376 - INFO - Accelerator device: 'cpu'
2025-10-25 23:43:28,300 - INFO - Processing document Abu Dhabi Procurement Standards.PDF
2025-10-25 23:43:28,900 - INFO - command: C:\Program Files\Tesseract-OCR\tess

Ingested 159 nodes from document.


In [None]:

# -----------------------------------------------------------------
# Example of how to use this class with your DoclingLlamaIngestor
# -----------------------------------------------------------------
if __name__ == "__main__":
   
    # --- 1. Database Configuration ---
    # !!! IMPORTANT: Update these with your PostgreSQL details !!!
    DB_CONFIG = {
        "db_host": "localhost",  # or your actual host
        "db_port": settings.DB_PORT,
        "db_user": settings.DB_USER,
        "db_pass": settings.DB_PASSWORD,
        "db_name": settings.DB_NAME,
    }
    # This will be the name of the table in your database
    COLLECTION_NAME = "doc_standards_hnsw"   

    # --- 3. Initialize Indexer ---
    print("\n--- Step 2: Initializing PGVector Indexer ---")
    try:
        indexer = PgVectorIndexer(
            **DB_CONFIG,
            collection_name=COLLECTION_NAME,
            embed_model_name="settings.EMBEDDING_MODEL",
        )
    except Exception as e:
        print(f"Failed to initialize indexer. Is PostgreSQL running? {e}", file=sys.stderr)
        sys.exit(1)

    # --- 4. Insert Nodes into Database ---
    print("\n--- Step 3: Inserting Nodes into PGVector ---")
    # Set force_reinsert=True if you want to re-upload the data
    indexer.insert_nodes(a, force_reinsert=False, create_hnsw=True)

    


--- Step 2: Initializing PGVector Indexer ---
Initializing PgVectorIndexer for collection: 'doc_standards_hnsw'
Initializing embedding model: embeddinggemma
LLM is explicitly disabled. Using MockLLM.
Initializing reranker: cross-encoder/ms-marco-minilm-l-6-v2
Connecting to vector store 'doc_standards_hnsw'...


2025-10-26 01:39:52,786 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"


Detected embedding dimension: 768
Connecting to docstore 'doc_standards_hnsw_docstore'...
Initializing storage context...
Loading index from storage context...


Failed to initialize indexer. Is PostgreSQL running? 'StorageContext' object has no attribute 'stores_text'


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
from db.pgvector_client import PgVectorIndexer
from data_ingestion.docling_llama_ingestor import DoclingLlamaIngestor
from core.config import settings
import os
import sys
from llama_index.node_parser.docling import DoclingNodeParser
# --- Import your other class and splitter ---
from llama_index.core.text_splitter import SentenceSplitter
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker


print("\n--- Step 4: Running a Test Query ---")
DB_CONFIG = {
        "db_host": "localhost",  # or your actual host
        "db_port": settings.DB_PORT,
        "db_user": settings.DB_USER,
        "db_pass": settings.DB_PASSWORD,
        "db_name": settings.DB_NAME,
    }
indexer = PgVectorIndexer(
            **DB_CONFIG,
            collection_name="default_collection",
            embed_model_name="settings.EMBEDDING_MODEL",
        )

  from .autonotebook import tqdm as notebook_tqdm


Loading config: DB_HOST=localhost, OLLAMA_URL=http://localhost:11434

--- Step 4: Running a Test Query ---
[PgVectorIndexer] initializing collection='default_collection'
[PgVectorIndexer] initializing embed model 'embeddinggemma' (ollama @ http://localhost:11434)
[PgVectorIndexer] ServiceContext not available â falling back to Settings
LLM is explicitly disabled. Using MockLLM.
[PgVectorIndexer] initializing reranker 'cross-encoder/ms-marco-minilm-l-6-v2'
[PgVectorIndexer] creating PGVectorStore for table 'default_collection'


2025-10-26 10:58:27,697 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"


[PgVectorIndexer] detected embedding dimension = 768
[PgVectorIndexer] creating PostgresDocumentStore for 'default_collection_docstore'
[PgVectorIndexer] creating StorageContext from vector_store + docstore
[PgVectorIndexer] loading VectorStoreIndex from storage context


In [3]:
# Add this to the end of your pgvector_indexer.py's __main__ block


# We set synthesize_answer=False so it only returns the source nodes
# This way, we don't need to have Settings.llm configured.
response = indexer.query(
    query_text="""google brain google brain""",
    similarity_top_k=10,
    rerank=False,
    synthesize_answer=False  # <-- This just gets the nodes
)

if response.source_nodes:
    print(f"\nSuccessfully retrieved {len(response.source_nodes)} source nodes:")
    
    for i, node_with_score in enumerate(response.source_nodes):
        node = node_with_score.node
        print(f"\n--- Node {i+1} (Score: {node_with_score.score:.4f}) ---")
        
        # This will print the HUMAN-READABLE text
        print(f"Text: {node.get_text()[:1000]}...") 
        
        # This will print the rich metadata, including page_label!
        print(f"Metadata: {node.metadata}") 
else:
    print("Query returned no results.")

[PgVectorIndexer] building query engine (similarity_top_k=10, response_mode=no_text)
[PgVectorIndexer] running query: google brain google brain


2025-10-26 10:58:35,357 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"


[PgVectorIndexer] retrieved 10 nodes

Successfully retrieved 10 source nodes:

--- Node 1 (Score: 0.6145) ---
Text: Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works....
Metadata: {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/1', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 124.66666666666667, 't': 718.3333333333334, 'r': 488.0, 'b': 679.6666666666666, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 173]}]}], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 2949302674760005271, 'filename': 'tmpwpwuy6wz1706.03762v7.pdf'}}

--- Node 2 (Score: 0.5183) ---
Text: Ashish Vaswani* Google Brain avaswani@google.com
Noam Shazeer* Google Brain noam@google.com
Llion Jones* Google Research llion@google.com
Niki Parmar* Goog

[PgVectorIndexer] building query engine (similarity_top_k=10, response_mode=compact)
[PgVectorIndexer] running query: how to train the model step by step
[PgVectorIndexer] retrieved 10 nodes


In [1]:
# Initialize the RAGAgent once
from rag_system.rag_agent import RAGAgent
from core.config import settings
rag_agent_instance = RAGAgent(
    model_name="ollama/qwen3:0.6b",
    base_url=settings.OLLAMA_URL
)

Loading config: DB_HOST=localhost, OLLAMA_URL=http://localhost:11434
[PgVectorIndexer] initializing collection='default_collection'
[PgVectorIndexer] initializing embed model 'embeddinggemma' (ollama @ http://localhost:11434)
[PgVectorIndexer] ServiceContext not available â falling back to Settings
LLM is explicitly disabled. Using MockLLM.
[PgVectorIndexer] initializing reranker 'cross-encoder/ms-marco-minilm-l-6-v2'


  from .autonotebook import tqdm as notebook_tqdm


[PgVectorIndexer] creating PGVectorStore for table 'default_collection'
[PgVectorIndexer] detected embedding dimension = 768
[PgVectorIndexer] creating PostgresDocumentStore for 'default_collection_docstore'
[PgVectorIndexer] creating StorageContext from vector_store + docstore
[PgVectorIndexer] loading VectorStoreIndex from storage context
Initializing RAGAgent with LLM: model='ollama/qwen3:0.6b', base_url='http://localhost:11434'


In [2]:

if __name__ == "__main__":
    
    # Run a query
    user_query = "what is attention mechanism in transformers?"
    if rag_agent_instance.llm: # Check if LLM initialized successfully
        final_answer = rag_agent_instance(user_query)
        print("\n--- Final Answer ---")
        print(final_answer)
    else:
        print("Could not run query because LLM failed to initialize.")

Kicking off RAG crew for query: 'what is attention mechanism in transformers?'

--- Final Answer ---
Attention mechanisms in transformers are a core component that enable the model to focus on specific parts of the input sequence. They work by calculating the weighted sum of input features, where the weights are determined by the probability of each position in the sequence. This allows the model to prioritize certain parts of the input, improving the accuracy of the output. The attention matrix is a key part of this mechanism, which dynamically adjusts the weight distribution across the input. This process is crucial for tasks like language modeling, where the model needs to understand context and dependencies in the input data. According to Dr. Jane Doe's detailed explanation in the attention-mechanism-detailed-2023.pdf, attention mechanisms are essential for optimizing information retrieval and contextual understanding in transformer-based models.


In [4]:
final_answer.raw

"Attention mechanisms in transformers are a core component that enable the model to focus on specific parts of the input sequence. They work by calculating the weighted sum of input features, where the weights are determined by the probability of each position in the sequence. This allows the model to prioritize certain parts of the input, improving the accuracy of the output. The attention matrix is a key part of this mechanism, which dynamically adjusts the weight distribution across the input. This process is crucial for tasks like language modeling, where the model needs to understand context and dependencies in the input data. According to Dr. Jane Doe's detailed explanation in the attention-mechanism-detailed-2023.pdf, attention mechanisms are essential for optimizing information retrieval and contextual understanding in transformer-based models."

In [None]:
from rag_system.tools import search_collection



[PgVectorIndexer] building query engine (similarity_top_k=3, response_mode=compact)
[PgVectorIndexer] running query: What is Google Brain?
[PgVectorIndexer] retrieved 3 nodes
Search result: {'text': "Context information is below.\n---------------------\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nheadings: ['5 Training']\n\nThis section describes the training regime for our models.\n\nheadings: ['Attention Is All You Need']\n\nAshish Vaswani* Google Brain avaswani@google.com\nNoam Shazeer* Google Brain noam@google.com\nLlion Jones* Google Research llion@google.com\nNiki Parmar* Google Research nikip@google.com\nAidan N. Gomez* + University of Toronto aidan@cs.toronto.edu\nJakob Uszkoreit* Google Research usz@google.com\nLukasz Kaiser* Google Brain lukaszkaiser@google.com\nIllia Polosukhin* +\nillia.polosukhin@gmail.com\n---------------------\nGiven the co

In [3]:
search_result = search_collection("what is attention mechanism", similarity_top_k=3)

[PgVectorIndexer] building query engine (similarity_top_k=3, response_mode=compact)
[PgVectorIndexer] running query: what is attention mechanism
[PgVectorIndexer] retrieved 3 nodes
Search result: {'text': "Context information is below.\n---------------------\nProvided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.\n\nheadings: ['5 Training']\n\nThis section describes the training regime for our models.\n\nheadings: ['6.3 English Constituency Parsing']\n\nTable 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23 of WSJ)\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: what is attention mechanism\nAnswer: ", 'metadata': [{'text': 'Provided proper attribution is provided, Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journ

In [32]:
from phoenix.client import Client
import json
client = Client(base_url="http://localhost:6006")
a = client.prompts.get(prompt_identifier="expert-search-query-formulator").format()
a= json.loads(a.messages[0]['content'])
a

{'role': 'Expert Search Query Formulator',
 'goal': 'Analyze a user query to extract key concepts and execute a search against the knowledge base using the `search_collection` tool to find the most relevant text chunks.',
 'backstory': "You are a highly specialized AI assistant with one purpose: information retrieval. You are an expert at dissecting user queries into their core components (keywords, entities, intent). Your *sole function* is to use these components to query the `search_collection` tool.\n\n**CRITICAL CONSTRAINTS:**\n1. You MUST NOT answer the user's query yourself. You MUST NOT summarize, interpret, or editorialize the search results.\n3. Your final output for the crew must be only the raw, unmodified data returned by the `search_collection` tool. Your job is to find the information, not explain it.",
 'task': {'description': "Analyze the user's query: '{query}'. Deconstruct it into key search terms, concepts, and entities. Pass these terms to the `search_collection` t

In [41]:
phoenix_client = Client(base_url="http://localhost:6006")
import ast 

def get_prompts(prmot_identifier: str):
    prompt_data = phoenix_client.prompts.get(prompt_identifier=prmot_identifier).format()
    print(prompt_data.messages[0]['content'])
    prompt_data  = json.loads(str(prompt_data.messages[0]['content']))
    return prompt_data
a = get_prompts("insight-synthesizer-analyst")

{
    "role": "Insight Synthesizer & Analyst",
    "goal": "Synthesize retrieved text chunks into a single, coherent, and accurate answer to the user's original query, strictly adhering to the provided context.",
    "backstory": "You are an expert analyst and communicator. You receive a set of raw text chunks from a researcher. Your job is to be the 'human-facing' voice, responsible for synthesizing this information into a high-quality, trustworthy answer for the user.\n\n**YOUR GUIDING PRINCIPLES (NON-NEGOTIABLE):**\n1. **THE CONTEXT IS YOUR ONLY TRUTH:** You MUST base your answer 100% *exclusively* on the information present in the provided text chunks.\n2. **NO EXTERNAL KNOWLEDGE:** You MUST NOT use any external information, personal opinions, or make assumptions to 'fill in the blanks'. Your knowledge is limited *only* to the context provided.\n3. **HANDLE MISSING INFORMATION:** If the provided context is insufficient or does not contain the answer, you MUST explicitly state that 

In [None]:
{
    "role": "Expert Search Query Formulator",
    "goal": "Analyze a user query to extract key concepts and execute a search against the knowledge base using the `search_collection` tool to find the most relevant text chunks.",
    "backstory": "You are a highly specialized AI assistant with one purpose: information retrieval. You are an expert at dissecting user queries into their core components (keywords, entities, intent). Your *sole function* is to use these components to query the `search_collection` tool.\n\n**TOOL DESCRIPTION:**\nYou have access to the following tool:\n\n`search_collection`\n- `query_text` (str): The text query to search for.\n- `similarity_top_k` (int): Number of top similar documents to retrieve. Default is 10.\n- `filters` (dict): Optional filters to apply to the search (e.g., `{'source_document': 'manual_v2.pdf'}` or `{'topic': 'safety'}`). Default is None.\n- `rerank` (bool): Whether to rerank the results for higher relevance. Default is False.\n- `rerank_top_n` (int): Number of top results to consider for reranking. Default is 10.\n\n**EXAMPLE USAGE:**\nIf the user asks: 'What are the safety protocols mentioned in the 2023 compliance manual?'\nYou should analyze this and call the tool with a refined query,  make sure that you add information to enrch the user question so that be easy to find the right search use your knowlage to eenrch the user query  like this:\n`search_collection(query_text='safety protocols 2023 compliance', similarity_top_k=10)`\nThis example only shows the primary parameters. `filters`, `rerank`, and `rerank_top_n` will use their default values rerank is False.\n\n**CRITICAL CONSTRAINTS:**\n1. You MUST NOT answer the user's query yourself.\n2. You MUST NOT summarize, interpret, or editorialize the search results.\n3. Your final output for the crew MUST be *only* the raw, unmodified data returned by the `search_collection` tool. Your job is to find the information, not explain it.",
    "task": {
      "description": "Analyze the user's query: '{query}'. Deconstruct it into key search terms, concepts, and entities. Pass these terms to the `search_collection` tool to retrieve the most relevant factual excerpts from the knowledge base.",
      "expected_output": "The raw, exact, and unmodified output from the `search_collection` tool. This will be a collection of text chunks, including any source metadata provided by the tool. There should be NO conversational text, summary, or any other text added."
    }


}

{'role': 'Expert Search Query Formulator',
 'goal': 'Analyze a user query to extract key concepts and execute a search against the knowledge base using the `search_collection` tool to find the most relevant text chunks.',
 'backstory': "You are a highly specialized AI assistant with one purpose: information retrieval. You are an expert at dissecting user queries into their core components (keywords, entities, intent). Your *sole function* is to use these components to query the `search_collection` tool.\n\n**CRITICAL CONSTRAINTS:**\n1. You MUST NOT answer the user's query yourself. You MUST NOT summarize, interpret, or editorialize the search results.\n3. Your final output for the crew must be only the raw, unmodified data returned by the `search_collection` tool. Your job is to find the information, not explain it.",
 'task': {'description': "Analyze the user's query: '{query}'. Deconstruct it into key search terms, concepts, and entities. Pass these terms to the `search_collection` t