In [1]:

from document_loader import load_work_instructions
from text_processing import dynamic_text_splitter, get_or_cache_embeddings
from vector_db import setup_qdrant
from retriever import setup_work_retriever
from reranker import rerank_documents
from config import RESOURCES_PATH
from langchain_ollama import OllamaEmbeddings
from langsmith import traceable, Client

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/zamlamb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/zamlamb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Extracted PDF table data: [[['Work Instruction\nDocument Code: ZF-IT-POL-R002\nDate : 14/11/2022\nVersion: 2', 'Manual Loading of Solids']], [{'1': '2', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Wear the specific PPE: neoprene gloves and a face shield for the helmet.'}, {'1': '3', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Remove the inlet cap from the reactor and the hose cap from the portable hopper.'}], [{'4': '5', 'Connect the hose to the reactor inlet.': 'Open the hopper lid and load the sack. Open the manual valve under the portable\nhopper to approximately 40º to allow air entry.'}, {'4': '6', 'Connect the hose to the reactor inlet.': 'Request the opening of valve H20202.'}, {'4': '7', 'Connect the hose to the reactor inlet.': 'Once the hopper is empty, request the closure of valve H20202.'}, {'4': '8', 'Connect the hose to the reactor inlet.': 'Close the hopper lid and the lower valve. Disc

  warn(


In [2]:
# Ensure Qdrant is running before executing this notebook
client = Client()

In [3]:
### Step 1: Validate Document Ingestion
print("\n### Step 1: Document Ingestion\n")
work_docs = load_work_instructions(RESOURCES_PATH)
for i, doc in enumerate(work_docs[:3]):
    print(f"\n=== Document {i+1} ===")
    print(f"Metadata: {doc.metadata}")
    print(f"Content (first 500 chars): {doc.page_content[:500]}...\n")


### Step 1: Document Ingestion

Extracted PDF table data: [[['Work Instruction\nDocument Code: ZF-IT-POL-R002\nDate : 14/11/2022\nVersion: 2', 'Manual Loading of Solids']], [{'1': '2', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Wear the specific PPE: neoprene gloves and a face shield for the helmet.'}, {'1': '3', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Remove the inlet cap from the reactor and the hose cap from the portable hopper.'}], [{'4': '5', 'Connect the hose to the reactor inlet.': 'Open the hopper lid and load the sack. Open the manual valve under the portable\nhopper to approximately 40º to allow air entry.'}, {'4': '6', 'Connect the hose to the reactor inlet.': 'Request the opening of valve H20202.'}, {'4': '7', 'Connect the hose to the reactor inlet.': 'Once the hopper is empty, request the closure of valve H20202.'}, {'4': '8', 'Connect the hose to the reactor inlet.': 'Close the hop

In [5]:
### Step 2: Validate Document Splitting
print("\n### Step 2: Document Splitting\n")
# Updated chunk size to 500 tokens with content-aware splitting.
work_chunks = dynamic_text_splitter(work_docs, default_chunk_size=500)
for i, chunk in enumerate(work_chunks[:5]):
    print(f"\n=== Chunk {i+1} ===")
    print(f"Metadata: {chunk.metadata}")
    print(f"Content (first 500 chars): {chunk.page_content[:500]}\n")



### Step 2: Document Splitting


=== Chunk 1 ===
Metadata: {'format': 'PDF 1.7', 'title': '', 'author': 'Ricard Torralba', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20250206230521+01'00'", 'modDate': "D:20250206230521+01'00'", 'trapped': '', 'encryption': None, 'Document Name': 'Work Instruction - Manual Load Solids', 'Step Number': None, 'Author': 'Ricard Torralba', 'Created Date': "D:20250206230521+01'00'", 'Keywords': [''], 'Process Type': None, 'Safety Measures': [], 'Version': '2', 'Date': '14/11/2022'}
Content (first 500 chars): AG Solution Spain S.A.U. C. Pujades 350, Planta 4, Puerta 1
08019 Barcelona – Spain
www.agsolutiongroup.com
+34 93 624 02 75
ES A 65931651
Work Instruction
Document Code: ZF-IT-POL-R002
Manual Loading of Solids
Date : 14/11/2022
Version: 2
Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor. 1
Wear the specific PPE: neopr

In [6]:
### Step 3: Validate Embedding Generation
print("\n### Step 3: Embedding Generation\n")
embedding_model = OllamaEmbeddings(model="paraphrase-multilingual")
embeddings = get_or_cache_embeddings(work_chunks, "workinst", embedding_model)
print(f"Total embeddings generated: {len(embeddings)}")
print(f"Example embedding vector (first 5 values): {embeddings[0][:5]}")


### Step 3: Embedding Generation

Generating workinst embeddings...
workinst embeddings cached.
Total embeddings generated: 37
Example embedding vector (first 5 values): [-0.0340843, 0.012698227, 1.2655666e-08, -0.003103407, 0.010113764]


In [8]:
from qdrant_client import QdrantClient
from config import WORK_COLLECTION

### Step 4: Validate Vector Database Storage
print("\n### Step 4: Vector Database Storage Validation\n")

# 1. First verify source documents have metadata
print("=== Source Document Metadata Check ===")
for i, chunk in enumerate(work_chunks[:3]):
    print(f"Document {i+1} Metadata:")
    print(f"• Keys: {list(chunk.metadata.keys())}")
    print(f"• Sample Values: { {k:v for k,v in chunk.metadata.items() if k in ['Document Name', 'Version']} }")
    print(f"• Content Start: {chunk.page_content[:100]}...\n")

# 2. Setup vector store
vector_store = setup_qdrant(work_chunks)

# 3. Verify Qdrant collection configuration
client = QdrantClient("localhost", port=6333)
collection_info = client.get_collection(WORK_COLLECTION)
print("\n=== Qdrant Collection Schema ===")
print(f"Indexed Fields: {collection_info.payload_schema}")
print(f"Vector Size: {collection_info.config.params.vectors.size}")

# 4. Direct payload inspection with content matching
print("\n=== Qdrant Storage Deep Validation ===")
records = client.scroll(
    collection_name=WORK_COLLECTION,
    with_payload=True,
    with_vectors=False,
    limit=3
)

for i, record in enumerate(records[0][:3]):
    print(f"\nDocument {i+1} Full Payload:")
    for key, value in record.payload.items():
        print(f"│ {key}: {str(value)[:100]}{'...' if len(str(value)) > 100 else ''}")

    # Find matching source document
    source_doc = next((d for d in work_chunks
                      if d.page_content.startswith(record.payload.get('page_content','')[:50])), None)

    if source_doc:
        print("✓ Content matches source document")
        print("Metadata Comparison:")
        print(f"│ Qdrant: {record.payload.get('metadata', {})}")
        print(f"└ Source: {source_doc.metadata}")
    else:
        print("⚠️ No matching source document found!")

    print("━" * 50)

print("Vector database work-validation complete.")


### Step 4: Vector Database Storage Validation

=== Source Document Metadata Check ===
Document 1 Metadata:
• Keys: ['format', 'title', 'author', 'subject', 'keywords', 'creator', 'producer', 'creationDate', 'modDate', 'trapped', 'encryption', 'Document Name', 'Step Number', 'Author', 'Created Date', 'Keywords', 'Process Type', 'Safety Measures', 'Version', 'Date']
• Sample Values: {'Document Name': 'Work Instruction - Manual Load Solids', 'Version': '2'}
• Content Start: AG Solution Spain S.A.U. C. Pujades 350, Planta 4, Puerta 1
08019 Barcelona – Spain
www.agsolutiongr...

Document 2 Metadata:
• Keys: ['format', 'title', 'author', 'subject', 'keywords', 'creator', 'producer', 'creationDate', 'modDate', 'trapped', 'encryption', 'Document Name', 'Step Number', 'Author', 'Created Date', 'Keywords', 'Process Type', 'Safety Measures', 'Version', 'Date']
• Sample Values: {'Document Name': 'Work Instruction - Manual Load Solids', 'Version': '2'}
• Content Start: 3
Antwerp – Paris - Barcelo

In [9]:
### Step 5: Validate Retrieval & Reranking
print("\n### Step 5: Retrieval & Reranking\n")

# Initialize DocumentAgentWork which handles model selection
from document_agent import DocumentAgentWork
doc_agent = DocumentAgentWork(debug_mode=True)

# Use the LLM instance from DocumentAgentWork
llm_instance = doc_agent.llm_instance
retriever = setup_work_retriever(work_chunks)
query = "How do I handle hypophosphorous acid?"

@traceable(project_name="workstations", client=client)
def trace_reranking(query: str, retrieved_docs: dict, llm) -> None:
    """Trace and display reranked documents for the query."""
    if "source_documents" in retrieved_docs and retrieved_docs["source_documents"]:
        # Phase 1: Verify raw retrieved metadata
        print("\n=== Phase 1: Raw Retrieved Metadata ===")
        for i, doc in enumerate(retrieved_docs["source_documents"][:3]):
            print(f"Document {i+1} Metadata:", getattr(doc, 'metadata', 'No metadata'))
            print(f"Content Preview: {doc.page_content[:100]}...\n")

        # Phase 2: Verify conversion to reranker format
        document_dicts = [
            {
                "page_content": doc.page_content,
                "metadata": getattr(doc, 'metadata', {})
            }
            for doc in retrieved_docs["source_documents"]
        ]
        print("\n=== Phase 2: Reranker Input Metadata ===")
        for i, doc in enumerate(document_dicts[:3]):
            print(f"Document {i+1} Metadata:", doc.get('metadata', 'No metadata'))
            print(f"Content Preview: {doc['page_content'][:100]}...\n")

        # Phase 3: Verify post-reranking metadata
        reranked_docs = rerank_documents(query, document_dicts, llm)
        print("\n=== Phase 3: Post-Reranking Metadata ===")
        for i, doc in enumerate(reranked_docs[:3]):
            print(f"Reranked {i+1} Metadata:", doc.get('metadata', 'No metadata'))
            print(f"Content Preview: {doc['page_content'][:100]}...\n")
    else:
        print("No relevant documents retrieved for reranking.")

# Retrieve and rerank documents
retrieved_docs = retriever.invoke({"query": query})
trace_reranking(query, retrieved_docs, llm_instance)


### Step 5: Retrieval & Reranking



  self._ollama = Ollama(model=model, temperature=temperature)


Extracted PDF table data: [[['Work Instruction\nDocument Code: ZF-IT-POL-R002\nDate : 14/11/2022\nVersion: 2', 'Manual Loading of Solids']], [{'1': '2', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Wear the specific PPE: neoprene gloves and a face shield for the helmet.'}, {'1': '3', 'Ensure a vacuum of 700 mb in R-002. Place the portable hopper next to the reactor.': 'Remove the inlet cap from the reactor and the hose cap from the portable hopper.'}], [{'4': '5', 'Connect the hose to the reactor inlet.': 'Open the hopper lid and load the sack. Open the manual valve under the portable\nhopper to approximately 40º to allow air entry.'}, {'4': '6', 'Connect the hose to the reactor inlet.': 'Request the opening of valve H20202.'}, {'4': '7', 'Connect the hose to the reactor inlet.': 'Once the hopper is empty, request the closure of valve H20202.'}, {'4': '8', 'Connect the hose to the reactor inlet.': 'Close the hopper lid and the lower valve. Disc

INFO:azure.identity._credentials.environment:Incomplete environment configuration for EnvironmentCredential. These variables are set: AZURE_TENANT_ID
INFO:azure.identity._credentials.managed_identity:ManagedIdentityCredential will use IMDS
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'http://169.254.169.254/metadata/identity/oauth2/token?api-version=REDACTED&resource=REDACTED'
Request method: 'GET'
Request headers:
    'User-Agent': 'azsdk-python-identity/1.20.0 Python/3.12.0 (Linux-6.12.18-1-lts-x86_64-with-glibc2.41)'
No body was attached to the request


Extracted PDF table data: [[['Work Instruction\nDocument Code: ZF-IT-SUL-\nLaboratory\nDate : 31/7/2020\nVersion: 1', 'Sulfated Analysis by HPLC']], [{'1': '2', 'Puncture the sample. Open the slot to insert the needle and introduce the sample.\nRemove the needle and close the slot.': 'Enter the Sample Name and in Method select the corresponding base for the analysis\nto be performed. Press Inject.'}], [{'3': '4', 'Click on Integrate and Quantitate. Click on Send data to review to visualize the\nresults.': 'Click on the icon, and the window Specify single inject Parameters will appear.'}]]
Loaded workinst embeddings from cache.


INFO:azure.identity._credentials.chained:DefaultAzureCredential acquired a token from AzureCliCredential
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://management.azure.com/subscriptions/b98e9951-860f-464a-a9a2-f69802ca8721/resourceGroups/ai_llm/providers/Microsoft.MachineLearningServices/workspaces/agentic_rag/connections?api-version=REDACTED&category=REDACTED&includeAll=REDACTED'
Request method: 'GET'
Request headers:
    'Accept': 'application/json'
    'x-ms-client-request-id': '24643cbe-00bc-11f0-86ec-38fc98f12ce6'
    'User-Agent': 'azsdk-python-ai-projects/1.0.0b6 Python/3.12.0 (Linux-6.12.18-1-lts-x86_64-with-glibc2.41)'
    'Authorization': 'REDACTED'
No body was attached to the request
INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 200
Response headers:
    'Cache-Control': 'no-cache'
    'Pragma': 'no-cache'
    'Content-Length': '3387'
    'Content-Type': 'application/json; charset=utf-8'
    'Expires': '-1'
    'Vary': '


=== Phase 1: Raw Retrieved Metadata ===
Document 1 Metadata: {'format': 'PDF 1.7', 'title': '', 'author': 'Ricard Torralba', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20250204224823+01'00'", 'modDate': "D:20250204224823+01'00'", 'trapped': '', 'encryption': None, 'Document Name': 'Work Instruction - Hypophosphorous Acid', 'Step Number': None, 'Author': 'Ricard Torralba', 'Created Date': "D:20250204224823+01'00'", 'Keywords': [''], 'Process Type': None, 'Safety Measures': [], 'Version': '2', 'Date': '20/06/2020', '_id': '286233f5-dc50-4fe7-b05d-0830e3e8f1de', '_collection_name': 'work_instructions_RAG'}
Content Preview: AG Solution Spain S.A.U. C. Pujades 350, Planta 4, Puerta 1
08019 Barcelona – Spain
www.agsolutiongr...

Document 2 Metadata: {'format': 'PDF 1.7', 'title': '', 'author': 'Ricard Torralba', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365'

ERROR:langchain_core.tracers.langchain:'QdrantClient' object has no attribute 'update_run'



=== Phase 3: Post-Reranking Metadata ===
Reranked 1 Metadata: {'format': 'PDF 1.7', 'title': '', 'author': 'Ricard Torralba', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20250204224823+01'00'", 'modDate': "D:20250204224823+01'00'", 'trapped': '', 'encryption': None, 'Document Name': 'Work Instruction - Hypophosphorous Acid', 'Step Number': None, 'Author': 'Ricard Torralba', 'Created Date': "D:20250204224823+01'00'", 'Keywords': [''], 'Process Type': None, 'Safety Measures': [], 'Version': '2', 'Date': '20/06/2020', '_id': '286233f5-dc50-4fe7-b05d-0830e3e8f1de', '_collection_name': 'work_instructions_RAG', 'score': 5.0}
Content Preview: AG Solution Spain S.A.U. C. Pujades 350, Planta 4, Puerta 1
08019 Barcelona – Spain
www.agsolutiongr...

Reranked 2 Metadata: {'format': 'PDF 1.7', 'title': '', 'author': 'Ricard Torralba', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for