In [None]:
from pathlib import Path

# Location of the sample document, relative to the notebook's working directory.
FILE_NAME = Path("data/html/sample-html.html")

# Fail fast with a clear message before any later cell tries to read the file.
# is_file() (rather than exists()) also rejects a directory that happens to sit at this path.
if not FILE_NAME.is_file():
    raise FileNotFoundError(f"Expected to find {FILE_NAME} before running the notebook")


## unstructured.io 

[Unstructured](https://github.com/Unstructured-IO/unstructured) is an open-source ETL toolkit that turns messy business documents—PDFs, Word files, HTML, even slide decks—into structured elements that downstream apps can understand. The library is modular, so you can:

- Detect layout, clean markup, and normalize text with a single pipeline.
- Partition documents into typed elements (titles, narrative text, tables, images, etc.).
- Chunk content intelligently using document context such as headings or tables.

In this notebook we focus on **chunking an HTML document**. The workflow looks like this:

1. Partition the document into semantic elements.
2. Chunk the elements by title so related bullet points stay together.


In [None]:
%pip install --quiet "unstructured[html]"  # Skip if the dependency is already satisfied


In [None]:
from textwrap import shorten
from unstructured.partition.html import partition_html

# Partition the sample HTML file into a list of elements and report how many were found.
source_path = str(FILE_NAME)
elements = partition_html(filename=source_path)
print(f"Loaded {len(elements)} elements from {FILE_NAME.name}")


def _element_text(element):
    """Return the best available text for ``element``.

    ``element.text`` wins when it is non-empty. Otherwise the function falls
    back to ``element.metadata.text_as_html`` (if the metadata carries a
    non-empty value), and finally to an empty string.
    """
    # `or` short-circuits, so the metadata is only consulted when the plain text is empty.
    return element.text or getattr(element.metadata, "text_as_html", None) or ""


# Print one line per element: position, category, full text length, and a short preview.
for idx, element in enumerate(elements):
    element_text = _element_text(element)
    # shorten() strips and collapses all whitespace (newlines included) before truncating,
    # so the text can be passed in as-is.
    preview = shorten(element_text, width=110, placeholder="...")
    print(f"[{idx:>2}] {element.category:<12} ({len(element_text)} chars) {preview}")


In [None]:
from collections import defaultdict

# Group non-title elements under the most recent title seen in document order.
sections = defaultdict(list)
current_title = "Untitled Section"  # bucket for content that appears before any title

for element in elements:
    element_text = _element_text(element).strip()
    if element_text and element.category != "Title":
        # Regular content: file it under the title currently in effect.
        sections[current_title].append((element.category, element_text))
    elif element_text:
        # A title starts a new section; keep the previous title if its own text is blank.
        current_title = element.text.strip() or current_title

print(f"Detected {len(sections)} content sections with narrative text:")
for title, content in sections.items():
    joined_segments = " ".join(segment for _, segment in content)
    # shorten() collapses newlines and other whitespace on its own before truncating.
    section_preview = shorten(joined_segments, width=120, placeholder="...")
    print(f"- {title}: {section_preview}")


In [None]:
from unstructured.chunking.title import chunk_by_title

# Chunking limits, all measured in characters.
MAX_CHARACTERS = 1200  # hard limit passed as max_characters
# Soft limit passed as new_after_n_chars. It cannot usefully exceed the hard limit (a larger
# value is capped at max_characters), so the previous setting of 1600 never took effect.
# Lower this below MAX_CHARACTERS to make chunks close earlier.
NEW_AFTER_N_CHARS = MAX_CHARACTERS
COMBINE_UNDER_N_CHARS = 150  # passed as combine_text_under_n_chars

chunks = chunk_by_title(
    elements,
    multipage_sections=True,
    max_characters=MAX_CHARACTERS,
    new_after_n_chars=NEW_AFTER_N_CHARS,
    combine_text_under_n_chars=COMBINE_UNDER_N_CHARS,
)

# Print a one-line summary per chunk and keep the same facts as records for later use.
chunk_records = []
for idx, chunk in enumerate(chunks):
    # Prefer the plain text; fall back to the HTML rendering stored in the metadata, if any.
    raw_text = chunk.text or getattr(chunk.metadata, "text_as_html", "") or ""
    # shorten() collapses newlines and other whitespace on its own before truncating.
    preview = shorten(raw_text, width=140, placeholder="...")
    print(f"Chunk {idx:02d} [{chunk.category} | {len(raw_text)} chars]: {preview}")

    chunk_records.append(
        {
            "idx": idx,
            "category": chunk.category,
            "chars": len(raw_text),
            "preview": preview,
        }
    )

sum_chars = sum(record["chars"] for record in chunk_records)
# An empty document yields no chunks; report 0.00 instead of raising ZeroDivisionError.
average_chars = sum_chars / len(chunk_records) if chunk_records else 0.0

print(f"\nCreated {len(chunk_records)} chunks (max {MAX_CHARACTERS} chars each).")
print(f"Average characters per chunk: {average_chars:.2f}")
print(f"Total characters across all chunks: {sum_chars}")
