In [71]:
import torch

from transformers import pipeline
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

In [72]:
# doc = DocumentConverter().convert(source="../artifacts/1/HIVE-COTE 2.0.pdf-3f1825be-ee35-4d60-996a-55bb4a8b9c07.pdf").document

doc = DocumentConverter().convert(source="../artifacts/1/test.docx").document

chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=800,
    overlap_tokens=200
)

chunks = list(chunker.chunk(doc))

2025-09-26 15:52:43,995 - INFO - detected formats: [<InputFormat.DOCX: 'docx'>]
2025-09-26 15:52:43,998 - INFO - Going to convert document batch...
2025-09-26 15:52:43,998 - INFO - Initializing pipeline for SimplePipeline with options hash 995a146ad601044538e6a923bea22f4e
2025-09-26 15:52:43,999 - INFO - Processing document test.docx
2025-09-26 15:52:44,071 - INFO - Finished converting document test.docx in 0.08 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [73]:
from typing import List, Any, Optional
from pydantic import BaseModel
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

class Node(BaseModel):
    title: str
    level: int
    children: List["Node"] = []
    doc_items: List[Any] = []
    embeddings: Optional[Any] = None

Node.model_rebuild()

tree_root = Node(title="Full Text", level=0)
for chunk in chunks:
    chunk_headings = chunk.meta.headings or []
    parent_node = tree_root

    for heading in chunk_headings:
        existing_node = next((n for n in parent_node.children if n.title == heading), None)
        if not existing_node:
            node = Node(title=heading, level=1)
            parent_node.children.append(node)
            parent_node = node
        else:
            parent_node = existing_node
    
    parent_node.doc_items.extend(chunk.meta.doc_items)

def embed_tree(node: Node):
    if node.doc_items:
        full_text = " ".join(getattr(item, "content", getattr(item, "text", "")) for item in node.doc_items)
        if full_text.strip():
            node.embeddings = embeddings_model.embed_query(full_text)
    for child in node.children:
        embed_tree(child)

embed_tree(tree_root)

In [None]:
from typing import Any, List
from pydantic import BaseModel

class Node(BaseModel):
    index: str
    text: str
    embedding: Any = None
    children: List["Node"] = []
    meta: Any = None

Node.model_rebuild()

node_dict = {}
for chunk in chunks:
    for item in chunk.meta.doc_items:
        
    self_ref = chunk.meta.doc_items[0].self_ref
    text = getattr(chunk, "text", "")
    node_dict[self_ref] = Node(index=self_ref, text=text, meta=chunk.meta)

In [None]:
for chunk in chunks:
    self_ref = chunk.meta.doc_items[0].self_ref
    parent_ref = chunk.meta.doc_items[1].parent.cref
    if parent_ref and parent_ref in node_dict:
        parent_node = node_dict[parent_ref]
        child_node = node_dict[item.self_ref]
        parent_node.children.append(child_node)

In [41]:

root_nodes = [node_dict[item.self_ref] for chunk in chunks for item in chunk.meta.doc_items
              if not item.parent or item.parent.cref not in node_dict]


KeyError: '#/texts/4'