In [1]:
pip install docling


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


=== Streaming Chunks ===

--- Chunk 1 ---
{
  "content": "Chapter 7:",
  "metadata": {
    "type": "text",
    "label": "section_header",
    "prov": {
      "page_no": 1,
      "bbox": {
        "l": 72.02400207519531,
        "t": 714.3099975585938,
        "r": 185.0659942626953,
        "b": 686.3499755859375,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
        0,
        10
      ]
    },
    "level": 1
  }
}

--- Chunk 2 ---
{
  "content": "In this section we will cover how to handle table structure with complex tables. This will be an overview on the best practices for table structure and how to set headers with different heading levels.",
  "metadata": {
    "type": "text",
    "label": "text",
    "prov": {
      "page_no": 1,
      "bbox": {
        "l": 72.02400207519531,
        "t": 658.510009765625,
        "r": 541.1690063476562,
        "b": 600.4299926757812,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
        0,
        201


In [12]:
def resolve_ref(ref, root):
    """
    Resolve a local reference string (e.g. "#/texts/0") by walking the root JSON.

    Args:
        ref: Reference string; it must start with "#/". The remainder is split
            on "/" and each part is applied as a dict key or a list index.
        root: The JSON document (nested dicts/lists) to walk.

    Returns:
        The value found at the referenced location. Missing dict keys follow
        dict.get semantics, so a missing *final* key yields None.

    Raises:
        ValueError: If ref does not start with "#/", if a part has to be
            applied to a value that is neither a list nor a dict, or if a list
            index is not an integer.
        IndexError: If a list index is out of range.
    """
    prefix = "#/"
    if not ref.startswith(prefix):
        raise ValueError("Only local references are supported: " + ref)
    # Remove exactly the prefix. str.lstrip("#/") strips a *set* of characters,
    # which would also eat leading "#" or "/" characters that belong to the
    # first path segment (e.g. "#/#meta/0" or "#//key").
    parts = ref[len(prefix):].split("/")
    current = root
    for part in parts:
        if isinstance(current, list):
            current = current[int(part)]
        elif isinstance(current, dict):
            current = current.get(part)
        else:
            raise ValueError(f"Cannot resolve part '{part}' in reference '{ref}'")
    return current

def extract_text(node, root):
    """
    Given a node that may be a reference, return its text content.

    Preference is given to the "text" field and then to "orig".

    Args:
        node: A dict node, a reference dict of the form {"$ref": "#/..."},
            a plain string, or None.
        root: The JSON document used to resolve references.

    Returns:
        The node's "text" value if truthy, otherwise its "orig" value if
        truthy, otherwise "". A plain string is returned unchanged. Anything
        that is not a dict or a string (for example a reference that resolved
        to None) yields "".
    """
    # If the node is a reference, resolve it.
    if isinstance(node, dict) and "$ref" in node:
        node = resolve_ref(node["$ref"], root)
    # Already plain text: nothing to look up.
    if isinstance(node, str):
        return node
    # resolve_ref returns None for a missing key; calling .get on that would
    # raise AttributeError, so fall back to the empty string instead.
    if not isinstance(node, dict):
        return ""
    # Return "text" or "orig" or an empty string.
    return node.get("text") or node.get("orig") or ""

def _attach_first_prov(node, metadata):
    """Copy the first entry of node["prov"] into metadata["prov"], if there is one."""
    prov = node.get("prov")
    if isinstance(prov, list) and prov:
        metadata["prov"] = prov[0]


def iter_chunks(json_data):
    """
    Generator that iterates over the document's content (as defined by "body.children")
    and yields a dictionary for each content chunk with its contextual metadata.

    Only the direct children of "body" are visited. Children whose label is not
    "text", "section_header" or "table" are skipped, as are children that are
    not dicts or whose reference resolves to something that is not a dict.

    For text nodes, we preserve:
      - The actual text (from "text" or "orig")
      - A label (e.g. "section_header" or "text")
      - Provenance info (first entry of "prov")
      - The heading level (if present)

    For table nodes, we extract:
      - A " | "-joined string of all non-empty table cell texts from data.table_cells
      - The label ("table")
      - Provenance info (first entry of "prov")
      - And any captions, which are resolved to the actual text.

    Args:
        json_data: The document as a dict.

    Yields:
        dict: {"content": str, "metadata": dict} for each supported node.
    """
    # `or` fallbacks also cover keys that are present but null.
    body = json_data.get("body") or {}
    children = body.get("children") or []

    for child in children:
        # Each child is a reference (e.g. {"$ref": "#/texts/0"} or "#/tables/0")
        if isinstance(child, dict) and "$ref" in child:
            node = resolve_ref(child["$ref"], json_data)
        else:
            node = child

        # A reference to a missing key resolves to None; skip it (and any other
        # non-dict child) the same way unsupported node types are skipped.
        if not isinstance(node, dict):
            continue

        label = node.get("label")

        # Process text nodes (from "texts" array)
        if label in ("text", "section_header"):
            # Prefer "text" over "orig"; always end up with a string.
            content = node.get("text") or node.get("orig") or ""

            # Build contextual metadata (key order is kept for stable JSON output).
            metadata = {"type": "text", "label": label}
            _attach_first_prov(node, metadata)
            if "level" in node:
                metadata["level"] = node["level"]

            yield {"content": content, "metadata": metadata}

        # Process table nodes (from "tables" array)
        elif label == "table":
            table_data = node.get("data") or {}
            cells = table_data.get("table_cells") or []

            # Join the text from each cell, skipping cells whose text is
            # missing, null, or blank after stripping.
            cell_texts = []
            for cell in cells:
                stripped = (cell.get("text") or "").strip()
                if stripped:
                    cell_texts.append(stripped)

            # Build contextual metadata for tables.
            metadata = {"type": "table", "label": "table"}
            _attach_first_prov(node, metadata)

            # Process captions: resolve each caption if it is a reference.
            resolved_captions = []
            for cap in node.get("captions") or []:
                caption_text = extract_text(cap, json_data)
                if caption_text:
                    resolved_captions.append(caption_text)
            if resolved_captions:
                metadata["captions"] = resolved_captions

            yield {"content": " | ".join(cell_texts), "metadata": metadata}

        # Additional node types can be handled here if needed.


In [13]:
# --- Example Usage ---

def flatten_json(data):
    """
    Print every chunk produced by iter_chunks(data) as indented JSON.

    Chunks are printed one at a time as the generator yields them, so no
    flattened copy of the document is built in memory.

    Args:
        data: The document dict to pass to iter_chunks.

    Returns:
        None. The chunks are written to standard output.
    """
    # Imported locally: no visible cell imports json, so relying on a
    # notebook-level import breaks on a fresh kernel (Restart & Run All).
    import json

    print("=== Streaming Chunks ===")
    # Process each chunk iteratively without building a giant in-memory flat structure.
    for idx, chunk in enumerate(iter_chunks(data), start=1):
        print(f"\n--- Chunk {idx} ---")
        print(json.dumps(chunk, indent=2))

In [14]:
from docling.document_converter import DocumentConverter
import pprint
# Convert the document with Docling and export the parsed result as a plain dict.
source = "files/docling tech report.pdf"  # PDF path or URL
converter = DocumentConverter()
result = converter.convert(source)
data = result.document.export_to_dict()

# Stream the body chunks (text, section headers, tables) as indented JSON.
flatten_json(data)

# Pretty-print the exported dict itself. Passing an f-string to pprint would
# only pretty-print one long string, not the nested structure.
print("dict data =")
pprint.pprint(data)

=== Streaming Chunks ===

--- Chunk 1 ---
{
  "content": "Docling Technical Report",
  "metadata": {
    "type": "text",
    "label": "section_header",
    "prov": {
      "page_no": 1,
      "bbox": {
        "l": 212.58999633789062,
        "t": 566.6400146484375,
        "r": 399.4119873046875,
        "b": 551.1630249023438,
        "coord_origin": "BOTTOMLEFT"
      },
      "charspan": [
        0,
        24
      ]
    },
    "level": 1
  }
}

--- Chunk 2 ---
{
  "content": "Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar",
  "metadata": {
    "type": "text",
    "label": "text",
    "prov": {
      "page_no": 1,
      "bbox": {
        "l": 113.64299774169922,
        "t": 481.5320129394531,
        "r": 498.3590087890625,
        "b