In [None]:
# Relative path of the sample HTML document that the parsing cells below read.
FILE_NAME = 'data/html/sample-html.html'

# HTML Parsing
In this notebook we compare a few popular strategies for turning raw HTML into structured chunks you can feed into downstream pipelines.
1. **BeautifulSoup** – a lightweight parser ideal for quick inspection or simple extraction.
2. **Unstructured.io** – a partitioning library that automatically segments documents into typed elements.
3. Other libraries and frameworks for HTML document parsing.

## BeautifulSoup

**Beautiful Soup** is a Python library designed for quick-turnaround web scraping and screen-scraping projects. Since 2004, it has helped developers extract data from poorly structured HTML and XML documents with minimal code. Key features include:

*   **Simple navigation and search methods** for parsing and modifying document trees
*   **Automatic encoding handling**, converting input to Unicode and output to UTF-8
*   **Parser flexibility**, working with `lxml`, `html5lib`, and Python’s built-in parser
*   **Powerful querying**, like finding links by class, matching URLs, or extracting nested elements

Beautiful Soup is lightweight, easy to integrate, and widely used in both personal and enterprise projects. It’s available via PyPI (`pip install beautifulsoup4`) and supported on Python 3.7+. Licensed under MIT, it’s ideal for developers needing fast, reliable data extraction from messy web pages.

In [None]:
from bs4 import BeautifulSoup

# Parse the HTML file and surface common elements: the document title,
# H1-H3 headings, non-anchor links, the first paragraph, and the first table.
SNIPPET_WIDTH = 120  # maximum number of characters shown from the first paragraph

with open(FILE_NAME, 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')

# Fall back to 'N/A' both when <title> is missing and when it is empty,
# mirroring the '(no text)' fallback used for links below.
title = (soup.title.get_text(strip=True) if soup.title else '') or 'N/A'
print(f'Document title: {title}\n')

# Collect (LEVEL, text) pairs for the top three heading levels in document order.
headings = [
    (tag.name.upper(), tag.get_text(' ', strip=True))
    for tag in soup.find_all(['h1', 'h2', 'h3'])
]
if headings:
    print('Headings:')
    for level, text in headings:
        print(f' - {level}: {text}')
else:
    print('No headings found.')

# Keep only links that carry an href and skip in-page anchors ("#section").
links = [
    (link.get_text(' ', strip=True) or '(no text)', link['href'])
    for link in soup.find_all('a', href=True)
    if not link['href'].startswith('#')
]
print(f"\nFound {len(links)} link(s):")
for idx, (text, href) in enumerate(links, start=1):
    print(f' {idx}. {text} -> {href}')

paragraphs = [p.get_text(' ', strip=True) for p in soup.find_all('p')]
if paragraphs:
    first_paragraph = paragraphs[0]
    snippet = first_paragraph[:SNIPPET_WIDTH]
    # Only append an ellipsis when text was actually cut off; otherwise the
    # output would wrongly suggest that a short paragraph was truncated.
    if len(first_paragraph) > SNIPPET_WIDTH:
        snippet += '...'
    print(f"\nFirst paragraph snippet: {snippet}")
else:
    # Report the empty case explicitly, like the headings and table sections do.
    print('\nNo paragraphs found.')

table = soup.find('table')
if table:
    # One list of cell texts per <tr>; header (<th>) and data (<td>) cells alike.
    rows = [
        [cell.get_text(' ', strip=True) for cell in row.find_all(['th', 'td'])]
        for row in table.find_all('tr')
    ]
    print('\nFirst table preview:')
    for row in rows[:3]:
        print(' | '.join(row))
else:
    print('\nNo table detected.')

## Unstructured.io

**Unstructured.io** is an open-source ETL library designed to convert complex documents—like PDFs, Word files, HTML, and images—into clean, structured data optimized for use with large language models (LLMs). It provides modular components for:

*   **Document ingestion and pre-processing**
*   **Auto-partitioning and format detection**
*   **Table and image enrichment**
*   **Chunking and embedding generation**

Unstructured supports both local development and containerized deployment via Docker. It integrates easily with Python and offers connectors for platforms like Discord. The library is ideal for building scalable, production-grade data pipelines and is available via PyPI and GitHub.

In [None]:

from unstructured.partition.html import partition_html
# Split the sample HTML document into a list of typed elements.
# NOTE(review): infer_table_structure is presumably passed so that table
# elements expose an HTML rendering via metadata.text_as_html — confirm
# against the installed unstructured version.
elements = partition_html(
    filename=FILE_NAME,
    infer_table_structure=True,
)
print(f"Parsed {len(elements)} element(s) from the HTML document.")

In [None]:
from collections import Counter
from textwrap import shorten

# Tally the parsed elements per category and list the most frequent first.
type_counts = Counter(el.category for el in elements)
print('Element counts by category:')
for name, total in type_counts.most_common():
    print(f' - {name}: {total}')

# Preview the first five elements with a shortened text and basic metadata.
print('\nSample elements:')
for position, el in enumerate(elements[:5], start=1):
    meta = el.metadata

    # Tables are previewed through their HTML rendering, everything else
    # through plain text; a missing value degrades to an empty string.
    raw_text = meta.text_as_html if el.category == 'Table' else el.text
    preview = shorten(raw_text or '', width=100, placeholder='…')

    # Gather whichever metadata fields are present and truthy.
    page = getattr(meta, 'page_number', None)
    origin = getattr(meta, 'filename', None)
    details = []
    if page:
        details.append(f'page {page}')
    if origin:
        details.append(origin)
    details_text = ' | '.join(details) if details else 'no metadata'

    print(f"{position}. [{el.category}] {preview}")
    print(f"    metadata: {details_text}")