# PDF Parsing and Chunking
This notebook builds on the parsing workflow from `09.ReadDocument/03.pdf-reader.ipynb` and extends it with chunking strategies you can reuse in downstream retrieval pipelines.
1. **Extraction helpers** – local (`PyPDF2`) and remote (Azure Document Intelligence) utilities for turning PDFs into Markdown and plain text.
2. **Cleaning & inspection** – shared helpers for token counting and quick chunk inspection.
3. **Chunking recipes** – header-based, recursive character, Unstructured, and Semchunk examples for experimenting with sectioning strategies.

In [None]:
%pip install logging tiktoken azure-ai-documentintelligence azure-core azure-identity PyPDF2 python-dotenv langchain langchain-text-splitters "unstructured[md]" semchunk markdown

In [1]:
# Path of the sample PDF that the later cells read, parse and chunk.
FILE_NAME = "data/pdf/sample-pdf.pdf"

In [None]:
import logging
import os
import re
from dataclasses import dataclass
from io import BytesIO
from typing import Iterable, Sequence

import semchunk
import tiktoken
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md

# Pull settings from a local .env file into the environment before any
# os.getenv lookups below run.
load_dotenv()
# Shared tiktoken encoding; num_tokens_from_string uses it for token counts.
encoding = tiktoken.get_encoding("cl100k_base")


@dataclass(frozen=True)
class ChunkingConfig:
    """Immutable bundle of defaults shared by the chunking examples."""

    # (marker, metadata label) pairs for heading levels 1 through 8:
    # ("#", "Header 1") up to ("########", "Header 8").
    # NOTE(review): standard Markdown headings stop at level 6; levels 7-8
    # are kept because the original configuration listed them.
    markdown_headers: tuple[tuple[str, str], ...] = tuple(
        ("#" * level, f"Header {level}") for level in range(1, 9)
    )
    # Size limit handed to every chunker; the unit (characters or tokens)
    # depends on the chunker that consumes it.
    chunk_size: int = 750
    # Overlap handed to the recursive character splitter.
    chunk_overlap: int = 50
    # Separators tried in order by the recursive character splitter.
    recursive_separators: tuple[str, ...] = (
        ".",
        "!",
        "\n\n",
        "\n",
        " ",
        "",
    )
    # Name passed to semchunk.chunkerify.
    semchunk_model: str = "gpt-4o-mini"


# Notebook-wide configuration. SEMCHUNK_MODEL, when present in the
# environment, replaces the default semchunk model name.
CONFIG = ChunkingConfig(semchunk_model=os.environ.get("SEMCHUNK_MODEL", "gpt-4o-mini"))

# Configure root logging only when nothing has attached a handler yet, so
# re-running this cell does not stack duplicate handlers.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            # Records go both to debug.log in the working directory ...
            logging.FileHandler("debug.log"),
            # ... and to the stream handler's default stream.
            logging.StreamHandler(),
        ],
    )

# Logger used by the helper functions defined below.
logger = logging.getLogger(__name__)


def build_markdown_header_splitter(config: ChunkingConfig) -> MarkdownHeaderTextSplitter:
    """Create the header-aware Markdown splitter described by *config*.

    Args:
        config: Supplies ``markdown_headers``, the (marker, label) pairs
            the splitter breaks on.

    Returns:
        A ``MarkdownHeaderTextSplitter`` constructed with
        ``strip_headers=False``.
    """
    splitter_options = {
        "headers_to_split_on": config.markdown_headers,
        "strip_headers": False,
    }
    return MarkdownHeaderTextSplitter(**splitter_options)


def build_recursive_splitter(config: ChunkingConfig) -> RecursiveCharacterTextSplitter:
    """Create the recursive character splitter used for second-pass chunking.

    Args:
        config: Supplies ``chunk_size``, ``chunk_overlap`` and
            ``recursive_separators``.

    Returns:
        A ``RecursiveCharacterTextSplitter`` built from those three values.
    """
    # The config stores separators as a tuple; the splitter is given a list.
    separator_list = list(config.recursive_separators)
    size, overlap = config.chunk_size, config.chunk_overlap
    return RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        separators=separator_list,
    )


def num_tokens_from_string(text: str) -> int:
    """Return how many tokens the module-level ``encoding`` yields for *text*."""
    token_ids = encoding.encode(text)
    return len(token_ids)


def print_chunks_page_content(page_content: Iterable) -> None:
    """Print a size overview of the chunks, then each chunk with its content.

    Output is, in order: the chunk count, every per-chunk summary line
    (character and token counts), an empty line, and finally every summary
    again, each immediately followed by the chunk body.

    Args:
        page_content: Chunks to report on. An item's ``page_content``
            attribute is used as its body when present; otherwise
            ``str(item)`` is used.
    """
    chunks = list(page_content)
    print(f"Number of chunks: {len(chunks)}")
    # Collect the pieces in lists and join once instead of growing two
    # strings with ``+=`` on every iteration.
    summaries = []
    detailed = []
    for index, chunk in enumerate(chunks, start=1):
        body = getattr(chunk, "page_content", str(chunk))
        summary = (
            f"\n\nChunk {index} character count: {len(body)} "
            f"token number: {num_tokens_from_string(body)}\n"
        )
        summaries.append(summary)
        detailed.append(summary + body)
    print("".join(summaries))
    print()
    print("".join(detailed))


def print_text_chunks(chunks: Sequence[str]) -> None:
    """Print chunk statistics, then each chunk, for plain-string chunks.

    Output is, in order: the chunk count, one summary line per chunk
    (character and token counts), an empty line, and then for every chunk
    its summary line, its text and an empty line.

    Args:
        chunks: Text chunks to report on.
    """
    print(f"Number of chunks: {len(chunks)}")
    # Build every summary once; both passes below reuse it, so each chunk
    # is tokenized a single time rather than twice.
    summaries = [
        f"Chunk {index} character count: {len(chunk)} "
        f"token number: {num_tokens_from_string(chunk)}"
        for index, chunk in enumerate(chunks, start=1)
    ]
    for summary in summaries:
        print(summary)
    print()
    for summary, chunk in zip(summaries, chunks):
        print(summary)
        print(chunk)
        print()


def handle_pdf_locally(uploaded_file: BytesIO, clean: bool = False) -> str:
    """Read a PDF with PyPDF2 and return the text of all pages.

    Args:
        uploaded_file: In-memory PDF; it is rewound to the start before reading.
        clean: When true, the joined text is passed through ``clean_text``.

    Returns:
        The page texts joined with newlines; a page with no extractable
        text contributes an empty string.

    Raises:
        Exception: Any error raised while reading is logged and re-raised.
    """
    logger.info("Processing document locally")
    try:
        uploaded_file.seek(0)
        reader = PdfReader(uploaded_file)
        joined = "\n".join((page.extract_text() or "") for page in reader.pages)
        if clean:
            return clean_text(joined)
        return joined
    except Exception:
        logger.exception("Error processing document locally")
        raise


def handle_pdf_remotely(uploaded_file: BytesIO, clean: bool = False) -> str:
    """Extract Markdown from a PDF using Azure Document Intelligence.

    The endpoint and key are read from the
    ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`` and
    ``AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY`` environment variables.

    Args:
        uploaded_file: In-memory PDF; it is rewound to the start before upload.
        clean: When true, the returned content is passed through ``clean_text``.

    Returns:
        The ``content`` of the ``prebuilt-layout`` analysis, requested in
        Markdown format.

    Raises:
        EnvironmentError: If either environment variable is missing or empty.
        Exception: Any error raised during analysis is logged and re-raised.
    """
    logger.info("Processing PDF document remotely")
    doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY")

    if not doc_intelligence_endpoint or not doc_intelligence_key:
        raise EnvironmentError("Azure Document Intelligence configuration is missing.")

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=doc_intelligence_endpoint,
        credential=AzureKeyCredential(doc_intelligence_key),
    )
    try:
        uploaded_file.seek(0)
        # Use the client as a context manager so it is closed once the
        # analysis finishes or fails. The poller is resolved inside the
        # block because polling still goes through the client.
        with document_intelligence_client:
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                body=uploaded_file,
                content_type="application/octet-stream",
                output_content_format="markdown",
            )
            result: AnalyzeResult = poller.result()
        return clean_text(result.content) if clean else result.content
    except Exception:
        logger.exception("Error processing PDF document remotely")
        raise


def read_file_bin(file_name: str) -> BytesIO:
    """Load *file_name* from disk into an in-memory binary buffer.

    Args:
        file_name: Path of the file to read.

    Returns:
        A ``BytesIO`` holding the file's full contents.

    Raises:
        FileNotFoundError: Logged and re-raised when the file does not exist.
    """
    logger.info("Reading file %s", file_name)
    try:
        with open(file_name, mode="rb") as source:
            payload = source.read()
    except FileNotFoundError:
        logger.exception("The file %s does not exist", file_name)
        raise
    return BytesIO(payload)


def save_file(file_name: str, data: str) -> None:
    """Store *data* in *file_name* as UTF-8 text, replacing existing content.

    Args:
        file_name: Destination path.
        data: Text to write.
    """
    logger.info("Saving file %s", file_name)
    with open(file_name, mode="w", encoding="utf-8") as target:
        target.write(data)


def clean_text(
    text: str,
    remove_comments: bool = False,
    put_html_tables_on_new_line: bool = True,
    preserve_linebreaks: bool = False,
) -> str:
    """Collapse redundant whitespace and optionally strip HTML comments.

    Steps, in order:

    1. Newlines inside each ``<table>...</table>`` span become spaces.
    2. Runs of newlines and runs of spaces are collapsed, and whitespace is
       removed before ``<``, after ``>``, before common punctuation and
       closing brackets, and after opening brackets.
    3. Optionally, every ``<table>`` is moved to the start of a new line.
    4. Optionally, every newline is doubled.
    5. Optionally, HTML comments are removed.

    Because step 2 removes any whitespace (newlines included) directly
    before ``<`` and after ``>``, text that follows a tag or comment ends
    up on the same line as that tag.
    NOTE(review): this includes a Markdown heading placed right after a
    tag, which then no longer starts its own line - confirm that this is
    acceptable for downstream header splitting.

    Args:
        text: Text to normalize.
        remove_comments: Remove ``<!-- ... -->`` comments as the final step.
        put_html_tables_on_new_line: Insert a newline before each ``<table>``.
        preserve_linebreaks: Replace each remaining newline with two.

    Returns:
        The normalized text.
    """
    logger.info("Cleaning text")
    # Flatten every table so the newline rules below treat it as one line.
    text = re.sub(
        '(?<=<table>)(.*?)(?=</table>)',
        lambda match: match.group(0).replace('\n', ' '),
        text,
        flags=re.DOTALL,
    )
    # Applied strictly in this order; later rules see earlier results.
    substitutions = (
        ('\n+', '\n'),
        (' +', ' '),
        (r'\s<', '<'),
        (r'>\s', '>'),
        (r'\s\.', '.'),
        (r'\s,', ','),
        (r'\s!', '!'),
        (r'\s\?', '?'),
        (r'\s:', ':'),
        (r'\s;', ';'),
        (r'\s\)', ')'),
        (r'\(\s', '('),
        (r'\[\s', '['),
        (r'\s\]', ']'),
        (r'\s\}', '}'),
        # Mirror the "(" and "[" rules: trim whitespace after an opening
        # brace. The previous rule here targeted "}" a second time, which
        # glued the text following a closing brace onto it.
        (r'\{\s', '{'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    if put_html_tables_on_new_line:
        text = text.replace('<table>', '\n<table>')
    if preserve_linebreaks:
        text = text.replace('\n', '\n\n')
    if remove_comments:
        text = re.sub(r'<!--(.*?)-->', '', text, flags=re.DOTALL)
    return text

# Generate Markdown 
The following section captures the Markdown output produced by Azure Document Intelligence.

In [3]:
# Read the sample PDF into memory, then pass a separate BytesIO copy of the
# same bytes to the remote parser; md_file holds the returned content.
file = read_file_bin(FILE_NAME)
md_file = handle_pdf_remotely(BytesIO(file.getvalue()))


2025-10-28 13:42:14,320 [INFO] Reading file data/pdf/sample-pdf.pdf
2025-10-28 13:42:14,326 [INFO] Processing PDF document remotely
2025-10-28 13:42:14,334 [INFO] Request URL: 'https://docinteli-we-paid.cognitiveservices.azure.com//documentintelligence/documentModels/prebuilt-layout:analyze?api-version=REDACTED&outputContentFormat=REDACTED'
Request method: 'POST'
Request headers:
    'content-type': 'application/octet-stream'
    'Accept': 'application/json'
    'x-ms-client-request-id': '87b3a284-b3fb-11f0-bee1-f42679b81856'
    'User-Agent': 'azsdk-python-ai-documentintelligence/1.0.2 Python/3.11.9 (Windows-10-10.0.26200-SP0)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request
2025-10-28 13:42:15,689 [INFO] Response status: 202
Response headers:
    'Content-Length': '0'
    'Operation-Location': 'REDACTED'
    'x-envoy-upstream-service-time': 'REDACTED'
    'apim-request-id': 'REDACTED'
    'Strict-Transport-Security': 'REDACTED'
    'x-content-type-options'

# Segment by Markdown Headers
Start by segmenting the Azure DI markdown using `MarkdownHeaderTextSplitter`. This preserves document hierarchy, giving each chunk structural metadata that is useful for retrieval augmentation and analytics.
For more background on the splitter, refer to [How to split Markdown by Headers](https://python.langchain.com/docs/how_to/markdown_header_metadata_splitter/).

In [4]:
# Show the raw parser output before any cleaning or splitting.
print(md_file)

<!-- PageHeader="Surface Pro 8 Fact Sheet October 2021" -->


<figure>

\-

TH

\-

YA

\-

HOLDORCS

</figure>


# Meet Surface Pro 8, designed for a Pro like you.

Unlock more possibilities than ever with Surface Pro 8. Designed to light up the best of Windows 11,
Surface Pro 8 combines the power of a laptop with the flexibility of a tablet, and every angle in between,
with the iconic Kickstand and detachable Keyboard* with built-in Slim Pen storage and charging.1 Do
great things with a larger 13" touchscreen, faster connections with Thunderbolt™ 4 ports, and extra speed
when you need it.


## Top Features and Benefits

. Our most powerful Pro ever. Surface Pro 8 is over 2 times faster than Surface Pro 7. Delivering
over 40% higher sustained CPU performance and 74% faster graphics sustained performance,
Surface Pro 8 will handle it all ...

· Extend the ultimate desktop experience with Thunderbolt™ 4 ports. Create your ultimate
productivity setup with multiple 4K monitors, keep large

In [5]:
# Build the header splitter from the shared config and split the raw
# parser output; md_text_splitter is reused later for the cleaned text.
md_text_splitter = build_markdown_header_splitter(CONFIG)
md_header_splits = md_text_splitter.split_text(md_file)

print(f"Length of splits: {len(md_header_splits)}")

Length of splits: 6


# Inspect Header-Based Chunks
Review the chunk inventory to confirm that the header grouping aligns with the logical sections of the source PDF. Spot-checking early prevents redundant downstream clean-up.

In [6]:
# Print per-chunk character/token counts followed by each chunk's content.
print_chunks_page_content(md_header_splits)

Number of chunks: 6
Chunk 1 character count: 118 token number: 39
Chunk 2 character count: 484 token number: 111
Chunk 3 character count: 1809 token number: 406
Chunk 4 character count: 4225 token number: 1406
Chunk 5 character count: 135 token number: 35
Chunk 6 character count: 2865 token number: 598


Chunk 1 character count: 118 token number: 39
<!-- PageHeader="Surface Pro 8 Fact Sheet October 2021" -->  
<figure>  
\-  
TH  
\-  
YA  
\-  
HOLDORCS  
</figure>Chunk 2 character count: 484 token number: 111
# Meet Surface Pro 8, designed for a Pro like you.  
Unlock more possibilities than ever with Surface Pro 8. Designed to light up the best of Windows 11,
Surface Pro 8 combines the power of a laptop with the flexibility of a tablet, and every angle in between,
with the iconic Kickstand and detachable Keyboard* with built-in Slim Pen storage and charging.1 Do
great things with a larger 13" touchscreen, faster connections with Thunderbolt™ 4 ports, and extra speed
when you need it

# Examine Chunk Metadata
Each `Document` produced by LangChain stores both the text fragment and its metadata (for this splitter, the header lineage keyed by the labels in `CONFIG.markdown_headers`). Inspecting the serialized view demonstrates which attributes you can surface in search indices or analytics dashboards.

In [None]:
# Serialize the third header chunk (index 2) to JSON to show its fields.
print(md_header_splits[2].model_dump_json())


# Refine Granularity
If the header-level chunks are still too large for your target context window, layer an additional splitter to produce evenly sized passages.

## Split with Recursive Character Strategy
`RecursiveCharacterTextSplitter` walks through a prioritized separator list to keep sentences intact whenever possible. This yields compact, coherent chunks that stay within the configured `chunk_size`, which is measured in characters unless a custom length function is supplied.
Reference documentation: https://python.langchain.com/docs/how_to/recursive_text_splitter/.

In [None]:
# Second pass: break the header-level chunks into smaller pieces using the
# chunk size, overlap and separators from CONFIG.
# NOTE(review): the splitter is built without a custom length function, so
# chunk_size presumably counts characters rather than tokens - confirm.
rct_text_splitter = build_recursive_splitter(CONFIG)

splits = rct_text_splitter.split_documents(md_header_splits)

print_chunks_page_content(splits)

## Normalize Content Prior to Chunking
Run a lightweight clean-up pass that collapses redundant whitespace (including around HTML tags and punctuation) and strips HTML comments. Cleaner input produces more predictable chunk boundaries and reduces token waste.

In [None]:
# Normalize whitespace and drop HTML comments from the raw parser output.
md_file_clean = clean_text(md_file, remove_comments=True)
print(md_file_clean)
# Derive the output name from the path without its extension. Replacing the
# ".pdf" substring would leave the name unchanged for e.g. "sample.PDF",
# making save_file overwrite the source document.
save_file(os.path.splitext(FILE_NAME)[0] + "-clean.md", md_file_clean)

In [None]:
# Re-run the header splitter on the cleaned text and report the chunk count.
md_header_splits_clean = md_text_splitter.split_text(md_file_clean)
print(f"Length of splits: {len(md_header_splits_clean)}")

In [None]:
# Print statistics and content for the header chunks of the cleaned text.
print_chunks_page_content(md_header_splits_clean)

In [None]:
# Repeat the second-pass split, this time on the cleaned header chunks.
# Rebinds rct_text_splitter and splits from the earlier cell.
rct_text_splitter = build_recursive_splitter(CONFIG)

splits = rct_text_splitter.split_documents(md_header_splits_clean)

print_chunks_page_content(splits)

# Compare Alternative Chunkers
Different workloads benefit from different chunking heuristics. The following sections illustrate how third-party libraries can complement LangChain's native splitters.

## Unstructured Library
`unstructured` offers layout-aware chunking tuned for reports and slide decks. Its title-based splitter keeps related paragraphs together while respecting configurable length limits.

In [None]:
# Length limits for chunk_by_title, all taken from the shared config.
# The combine threshold reuses CONFIG.chunk_overlap.
NEW_AFTER_N_CHARS = CONFIG.chunk_size
MAX_CHARACTERS = CONFIG.chunk_size
COMBINE_UNDER_N_CHARS = CONFIG.chunk_overlap

# Partition the cleaned Markdown into elements, then group them by title.
elements = partition_md(text=md_file_clean)
print(f"Number of elements: {len(elements)}")

chunks = chunk_by_title(
    elements,
    multipage_sections=True,
    max_characters=MAX_CHARACTERS,
    new_after_n_chars=NEW_AFTER_N_CHARS,
    combine_text_under_n_chars=COMBINE_UNDER_N_CHARS,
)

print(f"Number of chunks: {len(chunks)}")
for index, chunk in enumerate(chunks, start=1):
    # Tables are measured by their HTML rendering, everything else by text.
    # The parentheses make the "" fallback apply to both branches; without
    # them a table whose text_as_html is None would reach len(None).
    chunk_text = (
        chunk.metadata.text_as_html if chunk.category == "Table" else chunk.text
    ) or ""
    summary = (
        f"Chunk {index} ({chunk.category}) length={len(chunk_text)} "
        f"token_count={num_tokens_from_string(chunk_text)}"
    )
    print(summary)

## Semchunk
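`semchunk` recursively splits text at the most semantically meaningful separators it can find, using the configured model's tokenizer only to measure chunk length. Configure it with your target model and a chunk size in tokens to prototype chunk boundaries that fit that model's token budget.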

In [None]:
# Build a chunker from the configured model name and chunk size, then split
# the cleaned Markdown into plain-string chunks.
# NOTE(review): semchunk presumably interprets chunk_size in tokens of the
# named model's tokenizer, unlike the character-based splitters above - confirm.
semchunk_chunker = semchunk.chunkerify(CONFIG.semchunk_model, CONFIG.chunk_size)
semchunk_chunks = semchunk_chunker(md_file_clean)

print_text_chunks(semchunk_chunks)

# Next Steps
- Validate that Azure credentials are configured before invoking the remote parser.
- Experiment with alternative `chunk_size` and `chunk_overlap` settings to match your embedding model context window.
- Persist sample outputs (`.md`, `.txt`, and chunk JSON) so you can benchmark retrieval quality across chunking strategies.