# Skills: Apple Human Interface Guidelines
🔗 URL: https://developer.apple.com/design/human-interface-guidelines

Abbreviations:
- Human Interface Guidelines (HIG)

## Setup

In [32]:
import json
import os
import time
from pathlib import Path
from typing import Any
from urllib.parse import urlparse


import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from playwright.async_api import async_playwright

In [5]:
# Working directories used by this notebook. Both are created up front,
# including missing parents; re-running the cell is harmless (exist_ok=True).
data_dir, output_dir = Path("./data/apple_hig"), Path("../skills/apple_hig")
data_dir.mkdir(parents=True, exist_ok=True)
output_dir.mkdir(parents=True, exist_ok=True)

## Scrape Apple HIG

### Extract URLs to all pages with structure

In [3]:
async def get_hig_pages(
    base_url: str = "https://developer.apple.com/design/human-interface-guidelines",
) -> dict[str, Any]:
    """Crawl Apple HIG and return a structured map of all pages.

    The sidebar navigator supplies the top-level sections and each section's
    card grid supplies its children. A child whose ``.doc-content`` contains
    no ``<h2>`` is treated as a sub-section listing, and its own card grid is
    crawled one level deeper.

    Args:
        base_url: Landing page of the guidelines; the crawl starts here.

    Returns:
        A dict keyed by section slug (the last path segment of the section
        URL). Each value holds ``"url"`` (the section URL), ``"pages"`` (URLs
        of pages directly under the section) and ``"subsections"`` (a dict
        keyed by sub-section slug, each with its own ``"url"`` and ``"pages"``
        list).

    Raises:
        playwright.async_api.TimeoutError: If a navigation, the sidebar
            navigator, or a child page's ``.doc-content`` does not show up in
            time. A missing card grid is *not* fatal: it is reported and
            treated as "no links".
    """
    # Function-scope imports: both come from packages this notebook already
    # depends on, and nothing outside this crawler needs them.
    from urllib.parse import urljoin

    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    hig_structure: dict[str, Any] = {}

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            await page.goto(base_url, wait_until="networkidle")
            await page.wait_for_selector(".navigator-card-item")

            async def collect_links(locators) -> list[dict]:
                """Helper: Turn anchor locators into ``{"name", "url"}`` dicts."""
                found = []
                for link in locators:
                    href = await link.get_attribute("href")
                    if not href:
                        # An anchor without an href cannot be crawled.
                        continue
                    label = (await link.text_content()) or ""
                    # urljoin resolves root-relative, relative and absolute
                    # hrefs alike against the page currently displayed.
                    found.append(
                        {"name": label.strip(), "url": urljoin(page.url, href)}
                    )
                return found

            async def get_card_links(url: str) -> list[dict]:
                """Helper: Navigate to a URL and return all HIG links from its card grid."""
                await page.goto(url, wait_until="networkidle")
                try:
                    await page.wait_for_selector(".TopicsLinkCardGrid")
                except PlaywrightTimeoutError:
                    # wait_for_selector raises rather than returning when the
                    # grid never appears, so "no grid" has to be handled here.
                    print(f"  ! No card grid found at {url}")
                    return []

                anchors = (
                    await page.locator(".TopicsLinkCardGrid")
                    .locator("a[href*='design/human-interface-guidelines']")
                    .all()
                )
                return await collect_links(anchors)

            async def is_subsection(url: str) -> bool:
                """Helper: Navigate to a URL and check if it's a sub-section listing page."""
                await page.goto(url, wait_until="networkidle")
                await page.wait_for_selector(".doc-content")
                # Listing pages carry no <h2> headings inside the content area.
                return await page.locator(".doc-content h2").count() == 0

            # Step 1: Get the main sections from the sidebar navigator
            # (the expandable items, i.e. those that have a tree toggle).
            items = (
                await page.locator(".navigator")
                .locator(".navigator-card-item:has(.tree-toggle)")
                .all()
            )
            sections = await collect_links(
                [item.locator(".leaf-link") for item in items]
            )

            print(f"Found {len(sections)} sections: {[s['name'] for s in sections]}\n")

            # Step 2: For each section, get children and classify them
            for section in sections:
                section_slug = section["url"].rstrip("/").split("/")[-1]
                print(f"Section: {section['name']}")

                entry = {"url": section["url"], "pages": [], "subsections": {}}

                for child in await get_card_links(section["url"]):
                    child_slug = child["url"].rstrip("/").split("/")[-1]

                    if not await is_subsection(child["url"]):
                        entry["pages"].append(child["url"])
                        continue

                    # Step 2.1: Sub-section found, get its pages
                    sub_pages = await get_card_links(child["url"])
                    if not sub_pages:
                        # Heading-less page that lists nothing: keep it as a
                        # regular page so it is not dropped from the crawl.
                        entry["pages"].append(child["url"])
                        continue

                    entry["subsections"][child_slug] = {
                        "url": child["url"],
                        "pages": [p["url"] for p in sub_pages],
                    }

                total = len(entry["pages"]) + sum(
                    len(s["pages"]) for s in entry["subsections"].values()
                )
                print(
                    f"  -> {len(entry['pages'])} pages, "
                    f"{len(entry['subsections'])} sub-sections, "
                    f"{total} total\n"
                )
                hig_structure[section_slug] = entry
        finally:
            # Close the browser even when the crawl fails part-way through.
            await browser.close()

    return hig_structure

In [4]:
# Run the crawl with its default base URL. The bare top-level `await` is
# notebook syntax; a plain script would need an event loop (e.g. asyncio.run).
hig_structure = await get_hig_pages()

Found 6 sections: ['Getting started', 'Foundations', 'Patterns', 'Components', 'Inputs', 'Technologies']

Section: Getting started
  -> 7 pages, 0 sub-sections, 7 total

Section: Foundations
  -> 18 pages, 0 sub-sections, 18 total

Section: Patterns
  -> 25 pages, 0 sub-sections, 25 total

Section: Components
  -> 0 pages, 8 sub-sections, 63 total

Section: Inputs
  -> 13 pages, 0 sub-sections, 13 total

Section: Technologies
  -> 29 pages, 0 sub-sections, 29 total



In [6]:
# Persist the crawled structure map as pretty-printed JSON under data_dir.
hig_structure_path = data_dir / "hig_structure.json"
hig_structure_path.write_text(json.dumps(hig_structure, indent=4))

### Scrape content

In [31]:
def flatten_hig_structure(hig_structure: dict[str, Any]) -> list[str]:
    """Collapse the nested HIG structure into one flat list of page URLs.

    For every section, its direct pages come first, followed by the pages of
    each of its sub-sections; sections and sub-sections keep dict order.
    Duplicates are not removed.
    """
    urls: list[str] = []
    for entry in hig_structure.values():
        urls += entry["pages"]
        urls += [
            page_url
            for subsection in entry["subsections"].values()
            for page_url in subsection["pages"]
        ]
    return urls

In [None]:
async def scrape_hig_content(hig_structure: dict[str, Any]) -> dict[str, str]:
    """Scrape markdown content from all HIG pages.

    Args:
        hig_structure: Section map in the shape accepted by
            ``flatten_hig_structure`` (sections with ``"pages"`` and
            ``"subsections"``).

    Returns:
        A dict mapping each successfully scraped page URL to its markdown:
        an ``# <title>`` heading, the abstract when the page has one, then the
        ``.doc-content`` HTML converted to markdown. Pages that fail are
        reported on stdout and left out of the result.
    """
    all_pages = flatten_hig_structure(hig_structure)

    print(f"Scraping {len(all_pages)} pages...\n")

    results: dict[str, str] = {}
    failed: list[str] = []

    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for url in tqdm(all_pages, desc="Scraping pages", unit="page"):
                # Best-effort per page: one bad page must not abort the run.
                try:
                    await page.goto(url, wait_until="networkidle")
                    await page.wait_for_selector(".doc-content")

                    # text_content() may return None; fall back to "".
                    title = (
                        await page.locator("h1").first.text_content() or ""
                    ).strip()

                    abstract = ""
                    abstract_loc = page.locator(".abstract")
                    if await abstract_loc.count() > 0:
                        abstract = (
                            await abstract_loc.first.text_content() or ""
                        ).strip()

                    content_html = await page.locator(".doc-content").inner_html()
                    markdown_content = md(content_html, heading_style="ATX")

                    results[url] = (
                        f"# {title}\n\n{abstract}\n\n{markdown_content}"
                        if abstract
                        else f"# {title}\n\n{markdown_content}"
                    )
                except Exception as e:
                    print(f"Error scraping {url}: {e}")
                    failed.append(url)
        finally:
            # Runs on errors and on cancellation/interrupt alike; closing the
            # browser also closes the page opened above.
            await browser.close()

    if failed:
        print(f"\nFailed to scrape {len(failed)} of {len(all_pages)} pages.")

    return results

In [35]:
# Scrape every page listed in hig_structure; the result is keyed by page URL.
# The bare top-level `await` is notebook syntax.
scraped_pages = await scrape_hig_content(hig_structure)

Scraping 155 pages...



Scraping pages:   0%|          | 0/155 [00:00<?, ?page/s]

Future exception was never retrieved
future: <Future finished exception=TargetClosedError('Target page, context or browser has been closed')>
playwright._impl._errors.TargetClosedError: Target page, context or browser has been closed


In [38]:
# Persist the scraped markdown (keyed by page URL) as pretty-printed JSON under data_dir.
scraped_pages_path = data_dir / "scraped_pages.json"
scraped_pages_path.write_text(json.dumps(scraped_pages, indent=4))