Download Spring documentation websites and concatenate each one into a single Markdown file

In [1]:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from markdownify import markdownify as md
import time

# CSS selectors used when scraping a documentation site.
# NAV_SELECTOR is passed to soup.select() in extract_nav_links to find the
# navigation anchors whose href values list the pages to download.
NAV_SELECTOR = "a.nav-link"
# CONTENT_SELECTOR is passed to soup.select_one() in extract_content to find
# the single element that is converted to Markdown.
CONTENT_SELECTOR = "article.doc"

def get_soup(url):
    """Download ``url`` and parse the response body with BeautifulSoup.

    The request uses a 10 second timeout. Returns the parsed document, or
    ``None`` (after printing the error) when the request fails or the server
    answers with an HTTP error status.
    """
    try:
        page = requests.get(url, timeout=10)
        # Turn 4xx/5xx answers into exceptions so they take the error path below.
        page.raise_for_status()
        html = page.content
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {url}: {err}")
        return None
    else:
        return BeautifulSoup(html, 'html.parser')

def extract_nav_links(base_url, soup):
    """Collect the absolute page URLs referenced by the navigation anchors.

    Every element matching ``NAV_SELECTOR`` that has a non-empty ``href`` is
    resolved against ``base_url`` and stripped of its ``#fragment``. The
    result is a list of unique URLs in the order they first appear.
    """
    # Dict keys keep insertion order, so they double as an ordered set.
    unique_urls = {}

    for anchor in soup.select(NAV_SELECTOR):
        target = anchor.get('href')
        if not target:
            continue
        # Relative hrefs are resolved against the page they were found on.
        absolute_url = urljoin(base_url, target)
        # Anchors into the same page would otherwise show up as separate links.
        page_url = absolute_url.partition('#')[0]
        unique_urls.setdefault(page_url, None)

    return list(unique_urls)

def extract_content(soup):
    """Return the page's main content as Markdown, or ``None`` if it is missing.

    The first element matching ``CONTENT_SELECTOR`` is serialized to HTML and
    converted with markdownify using ATX (``#``) headings. No additional
    cleaning (permalinks, navigation buttons) is performed.
    """
    article = soup.select_one(CONTENT_SELECTOR)
    if not article:
        return None

    article_html = str(article)
    return md(article_html, heading_style="ATX")

def process_website(base_url, output_file, title=None):
    """Download one documentation site and concatenate its pages into a Markdown file.

    The page at ``base_url`` is fetched, its navigation links are collected
    with ``extract_nav_links``, and the Markdown returned by
    ``extract_content`` for each linked page is written to ``output_file``,
    with a horizontal rule between pages.

    Args:
        base_url: URL of the entry page whose navigation lists the pages to
            download.
        output_file: Path of the Markdown file to create. If it already
            exists, a warning is printed and nothing is downloaded.
        title: Heading written at the top of the file. When omitted it is
            derived from the output file name, e.g. ``spring-boot.md``
            becomes ``Spring Boot Documentation``. Pass it explicitly when
            the derived casing is not what you want (e.g. acronyms).

    Raises:
        RuntimeError: If the entry page cannot be fetched.
    """
    if os.path.exists(output_file):
        print(f"WARNING: The file '{output_file}' already exists.")
        return

    if title is None:
        # Derive the heading from the file name so each site gets its own
        # title instead of a fixed one.
        stem = os.path.splitext(os.path.basename(output_file))[0]
        readable_name = stem.replace("-", " ").replace("_", " ").title()
        title = f"{readable_name} Documentation"

    # 1. Fetch the entry page
    main_soup = get_soup(base_url)
    if not main_soup:
        raise RuntimeError("Error fetching main page. Check BASE_URL.")

    # 2. Extract all page links
    page_links = extract_nav_links(base_url, main_soup)
    print(f"Found {len(page_links)} pages to process.")

    # open() does not create missing parent directories.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # 3. Iterate through links and append content.
    # Write to a temporary sibling file and move it into place only on
    # success; otherwise an interrupted run would leave a truncated
    # output_file that the exists-check above treats as complete.
    partial_file = f"{output_file}.part"
    with open(partial_file, "w", encoding="utf-8") as f:
        # Add a title to the document
        f.write(f"# {title}\n\nSource: {base_url}\n\n---\n\n")

        for i, link in enumerate(page_links, start=1):
            print(f"Processing ({i}/{len(page_links)}): {link}")

            page_soup = get_soup(link)
            if page_soup:
                markdown_content = extract_content(page_soup)

                if markdown_content:
                    f.write(markdown_content)
                    f.write("\n\n---\n\n")  # Separator between pages
                else:
                    print(f"Warning: No content found for selector '{CONTENT_SELECTOR}' on {link}")

            # Be polite to the server (also after failed requests)
            time.sleep(0.5)

    os.replace(partial_file, output_file)
    print(f"\nDone! Documentation saved to {output_file}")

In [2]:
# Output file name -> entry page of the documentation site to download.
urls = {
    "spring-security.md": "https://docs.spring.io/spring-security/reference/index.html",
    "spring-authorization-server.md": "https://docs.spring.io/spring-authorization-server/reference/overview.html",
    "spring-boot.md": "https://docs.spring.io/spring-boot/index.html",
    "spring-graphql.md": "https://docs.spring.io/spring-graphql/reference/index.html",
    "spring-framework.md": "https://docs.spring.io/spring-framework/reference/overview.html",
    "spring-data-commons.md": "https://docs.spring.io/spring-data/commons/reference/index.html",
    "spring-data-jpa.md": "https://docs.spring.io/spring-data/jpa/reference/",
    "spring-amqp.md": "https://docs.spring.io/spring-amqp/reference/",
    "spring-integration.md": "https://docs.spring.io/spring-integration/reference/",
    "spring-modulith.md": "https://docs.spring.io/spring-modulith/reference/index.html"
}

# Directory (relative to the current working directory) receiving one
# Markdown file per site. Create it up front so a run on a fresh checkout
# does not fail when the first file is opened.
OUTPUT_DIR = "data/ephemeral/spring-doc"
os.makedirs(OUTPUT_DIR, exist_ok=True)

for output_file, base_url in urls.items():
    process_website(base_url, f"{OUTPUT_DIR}/{output_file}")


Found 10 pages to process.
Processing (1/10): https://docs.spring.io/spring-modulith/reference/index.html
Processing (2/10): https://docs.spring.io/spring-modulith/reference/fundamentals.html
Processing (3/10): https://docs.spring.io/spring-modulith/reference/verification.html
Processing (4/10): https://docs.spring.io/spring-modulith/reference/events.html
Processing (5/10): https://docs.spring.io/spring-modulith/reference/testing.html
Processing (6/10): https://docs.spring.io/spring-modulith/reference/moments.html
Processing (7/10): https://docs.spring.io/spring-modulith/reference/documentation.html
Processing (8/10): https://docs.spring.io/spring-modulith/reference/runtime.html
Processing (9/10): https://docs.spring.io/spring-modulith/reference/production-ready.html
Processing (10/10): https://docs.spring.io/spring-modulith/reference/appendix.html

Done! Documentation saved to data/ephemeral/spring-doc/spring-modulith.md
