In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os

# Base URL of the site
base_url = "https://docs.ros.org/"

# Set to store visited URLs to prevent duplication
visited_urls = set()

# Directory to save scraped data
output_dir = "scraped_docs"
os.makedirs(output_dir, exist_ok=True)

def save_to_file(url, content):
    """
    Save scraped content to a text file.
    :param url: The URL of the page
    :param content: The content to write to the file
    """
    # Generate a valid filename from the URL
    parsed_url = urlparse(url)
    filename = f"{parsed_url.netloc}_{parsed_url.path.replace('/', '_').strip('_')}.txt"
    file_path = os.path.join(output_dir, filename)

    # Write content to the file
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Saved content to {file_path}")

def scrape_page_dfs(url, depth=1, max_depth=3):
    """
    Perform DFS scraping of the given URL.
    :param url: URL to scrape
    :param depth: Current depth level
    :param max_depth: Maximum depth to recurse into links
    """
    if depth > max_depth or url in visited_urls:
        return

    # Add to visited URLs
    visited_urls.add(url)

    try:
        # Fetch the page content
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to access {url} (Status code: {response.status_code})")
            return

        print(f"\nScraping {url} (Depth: {depth})")
        soup = BeautifulSoup(response.content, "html.parser")

        # Prepare content for saving
        content = f"URL: {url}\n"
        title = soup.title.string if soup.title else "No title"
        content += f"Page Title: {title}\n\n"

        # Extract and add headings
        headings = soup.find_all(['h1', 'h2'])
        content += "Headings:\n"
        for heading in headings:
            content += f"- {heading.text.strip()}\n"

        # Extract and add paragraphs
        paragraphs = soup.find_all('p')
        content += "\nParagraphs:\n"
        for paragraph in paragraphs[:3]:  # Limit to first 3 for brevity
            content += f"- {paragraph.text.strip()}\n"

        # Save content to a file
        save_to_file(url, content)

        # Extract internal links and perform DFS
        print("\nFollowing links:")
        links = soup.find_all('a', href=True)
        for link in links:
            href = link['href']
            # Convert relative URLs to absolute URLs
            full_url = urljoin(url, href)
            # Ensure the link is within the docs.ros.org domain and not visited
            if base_url in full_url and full_url not in visited_urls:
                print(f"Found link: {full_url}")
                scrape_page_dfs(full_url, depth + 1, max_depth)

    except Exception as e:
        print(f"Error scraping {url}: {e}")

# Start DFS scraping from the base URL
scrape_page_dfs(base_url, depth=1, max_depth=3)



Scraping https://docs.ros.org/ (Depth: 1)
Saved content to scraped_docs/docs.ros.org_.txt

Following links:
Found link: https://docs.ros.org/en/humble

Scraping https://docs.ros.org/en/humble (Depth: 2)
Saved content to scraped_docs/docs.ros.org_en_humble.txt

Following links:
Found link: https://docs.ros.org/en/Installation.html
Failed to access https://docs.ros.org/en/Installation.html (Status code: 404)
Found link: https://docs.ros.org/en/Installation/Ubuntu-Install-Debs.html
Failed to access https://docs.ros.org/en/Installation/Ubuntu-Install-Debs.html (Status code: 404)
Found link: https://docs.ros.org/en/Installation/Windows-Install-Binary.html
Failed to access https://docs.ros.org/en/Installation/Windows-Install-Binary.html (Status code: 404)
Found link: https://docs.ros.org/en/Installation/RHEL-Install-RPMs.html
Failed to access https://docs.ros.org/en/Installation/RHEL-Install-RPMs.html (Status code: 404)
Found link: https://docs.ros.org/en/Installation/Alternatives.html
Fail