# PDF Parsing
This notebook walks through practical ways to transform raw PDF files into structured text that downstream pipelines can consume. It covers:
1. **PyPDF2** – lightweight local extraction for quickly reading text from each page.
2. **Azure Document Intelligence** – cloud OCR + layout service that returns markdown with tables, figures, and structure preserved.
3. **Utility helpers** – shared helpers for cleaning text, counting tokens, chunk inspection, and saving intermediate outputs.

In [None]:
%pip install logging tiktoken azure-ai-documentintelligence azure-core azure-identity PyPDF2 python-dotenv

In [None]:
# Path of the input PDF; the final cell derives the .md and .txt output paths from it.
FILE_NAME = "data/pdf/sample-pdf.pdf"

In [None]:
import logging
import os
import re
from io import BytesIO
from typing import Iterable

import tiktoken
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult

# Load environment variables via python-dotenv (the Azure settings are read
# with os.getenv in handle_pdf_remotely).
load_dotenv()
# Tokenizer shared by num_tokens_from_string for token counting.
encoding = tiktoken.get_encoding("cl100k_base")

# Configure the root logger only when it has no handlers yet, so re-running
# this cell does not attach duplicate handlers.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            # Relative path: the log file is created in the current working directory.
            logging.FileHandler("debug.log"),
            logging.StreamHandler()
        ]
    )

# Logger used by the helper functions defined below.
logger = logging.getLogger(__name__)

def handle_pdf_locally(uploaded_file: BytesIO, clean: bool = False) -> str:
    """Read every page of an in-memory PDF with PyPDF2 and return its text.

    Args:
        uploaded_file: Binary stream holding the PDF; it is rewound to the
            start before parsing.
        clean: When true, the extracted text is passed through ``clean_text``.

    Returns:
        The page texts joined with newlines. A page for which PyPDF2 yields
        no text contributes an empty string.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    logger.info("Processing document locally")
    try:
        uploaded_file.seek(0)
        reader = PdfReader(uploaded_file)
        page_texts = []
        for page in reader.pages:
            extracted = page.extract_text()
            # Substitute an empty string for falsy results so join() never sees None.
            page_texts.append(extracted if extracted else "")
        combined = "\n".join(page_texts)
        if clean:
            return clean_text(combined)
        return combined
    except Exception:
        logger.exception("Error processing document locally")
        raise

def handle_pdf_remotely(uploaded_file: BytesIO, clean: bool = False) -> str:
    """Extract markdown from a PDF using Azure Document Intelligence.

    The endpoint and key are read from the environment variables
    ``AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT`` and
    ``AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY``.

    Args:
        uploaded_file: Binary stream holding the PDF; it is rewound to the
            start before being sent.
        clean: When true, the returned content is passed through ``clean_text``.

    Returns:
        The ``content`` of the analysis result, requested in markdown format
        from the ``prebuilt-layout`` model.

    Raises:
        EnvironmentError: If the endpoint or the key is unset or empty.
        Exception: Any failure during analysis is logged with its traceback
            and re-raised.
    """
    logger.info("Processing PDF document remotely")
    doc_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
    doc_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ADMIN_KEY")

    if not doc_intelligence_endpoint or not doc_intelligence_key:
        raise EnvironmentError("Azure Document Intelligence configuration is missing.")

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=doc_intelligence_endpoint,
        credential=AzureKeyCredential(doc_intelligence_key),
    )
    try:
        # Use the client as a context manager so its underlying HTTP transport
        # is closed on both success and failure instead of leaking per call.
        with document_intelligence_client:
            uploaded_file.seek(0)
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout",
                body=uploaded_file,
                content_type="application/octet-stream",
                output_content_format="markdown",
            )
            # Blocks until the long-running analysis operation completes.
            result: AnalyzeResult = poller.result()
            return clean_text(result.content) if clean else result.content
    except Exception:
        logger.exception("Error processing PDF document remotely")
        raise

def read_file_bin(file_name: str) -> BytesIO:
    """Load a file fully into memory and wrap its bytes in a ``BytesIO``.

    Args:
        file_name: Path of the file to read in binary mode.

    Returns:
        A new ``BytesIO`` holding the complete file contents.

    Raises:
        FileNotFoundError: Logged with its traceback and re-raised when the
            path does not exist.
    """
    logger.info("Reading file %s", file_name)
    try:
        with open(file_name, mode="rb") as source:
            payload = source.read()
        return BytesIO(payload)
    except FileNotFoundError:
        logger.exception("The file %s does not exist", file_name)
        raise

def save_file(file_name: str, data: str) -> None:
    """Persist *data* as UTF-8 text at *file_name*, replacing any existing file.

    Args:
        file_name: Destination path, opened in write mode.
        data: Text to write.
    """
    logger.info("Saving file %s", file_name)
    with open(file_name, mode="w", encoding="utf-8") as sink:
        sink.write(data)

def num_tokens_from_string(text: str) -> int:
    """Return how many tokens the module-level ``encoding`` produces for *text*."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def print_chunks_page_content(page_content: Iterable) -> None:
    """Print the chunk count, then size statistics and text for every chunk.

    Args:
        page_content: Iterable of chunks. A chunk's ``page_content`` attribute
            is printed when it has one; otherwise ``str(chunk)`` is used.
    """
    # Materialize first so the total can be reported before iterating.
    materialized = list(page_content)
    print(f"Number of chunks: {len(materialized)}")
    for number, item in enumerate(materialized, 1):
        content = getattr(item, "page_content", str(item))
        char_count = len(content)
        token_count = num_tokens_from_string(content)
        print(
            f"Chunk {number} character count: {char_count} token number: {token_count}"
        )
        print(content)
        print()

def clean_text(
    text: str,
    remove_comments: bool = False,
    put_html_tables_on_new_line: bool = True,
) -> str:
    """Remove redundant whitespace and optionally strip HTML comments.

    Processing order:
      1. Newlines between ``<table>`` and ``</table>`` become spaces.
      2. Runs of newlines and runs of spaces are collapsed to one character.
      3. One whitespace character is dropped before ``<`` and closing
         punctuation (``. , ! ? : ; ) ] }``) and after ``>`` and opening
         brackets (``( [ {``).
      4. Optionally, every ``<table>`` is moved onto its own line.
      5. Optionally, ``<!-- ... -->`` comments are removed.

    Args:
        text: Text to normalize.
        remove_comments: When true, strip HTML comments (including multi-line ones).
        put_html_tables_on_new_line: When true, insert a newline before each
            ``<table>`` tag.

    Returns:
        The normalized text.
    """
    logger.info("Cleaning text")
    # Flatten each table body so the newline rules below cannot split a table.
    text = re.sub(
        r'(?<=<table>)(.*?)(?=</table>)',
        lambda match: match.group(0).replace('\n', ' '),
        text,
        flags=re.DOTALL,
    )
    # Applied in insertion order: collapse runs first, then trim around punctuation.
    patterns = {
        r'\n+': '\n',
        r' +': ' ',
        r'\s<': '<',
        r'>\s': '>',
        r'\s\.': '.',
        r'\s,': ',',
        r'\s!': '!',
        r'\s\?': '?',
        r'\s:': ':',
        r'\s;': ';',
        r'\s\)': ')',
        r'\(\s': '(',
        r'\[\s': '[',
        r'\s\]': ']',
        r'\s\}': '}',
        # Opening brace mirrors the "(" and "[" rules: trim the whitespace
        # after it, and leave the text that follows a closing brace alone.
        r'\{\s': '{',
    }
    for pattern, replacement in patterns.items():
        text = re.sub(pattern, replacement, text)
    if put_html_tables_on_new_line:
        # The r'\s<' rule above removed any whitespace before the tag; restore a line break.
        text = text.replace('<table>', '\n<table>')
    if remove_comments:
        text = re.sub(r'<!--(.*?)-->', '', text, flags=re.DOTALL)
    return text

# Process the document with Azure Document Intelligence and PyPDF2

In [None]:
# Read the PDF once; both handlers rewind the buffer before parsing it.
file = read_file_bin(FILE_NAME)
md_file = handle_pdf_remotely(file)
txt_file = handle_pdf_locally(file)

# Replace only the final extension. A plain str.replace(".pdf", ...) would touch
# every ".pdf" in the path and, when the name has no lowercase ".pdf", would
# leave the path unchanged and overwrite the source PDF.
save_file(os.path.splitext(FILE_NAME)[0] + ".md", md_file)
save_file(os.path.splitext(FILE_NAME)[0] + ".txt", txt_file)