In [None]:
import os
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

def is_valid(url):
    """
    Check whether the URL is valid.

    A URL counts as valid when it parses and carries both a scheme (such as
    "https") and a network location (the host part). Relative references and
    scheme-only links such as "mailto:..." therefore return False.

    Malformed input that urlparse rejects outright (for example an
    unbalanced IPv6 bracket like "http://[invalid") is reported as invalid
    rather than propagating ValueError to the caller.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on structurally broken URLs; for a yes/no
        # validity check that simply means "no".
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_domain(url):
    """
    Return the network-location component (netloc) of the given URL.

    This is the part between the scheme and the path, so it includes any
    userinfo or port present in the URL. A URL without a network location
    yields an empty string.
    """
    parts = urlparse(url)
    return parts.netloc

def save_page(url, content, root_folder="downloaded_pages"):
    """
    Save the page content to a file under the specified folder.

    The filename is "<netloc>_<path>.html", where slashes in the path are
    replaced by underscores and an empty path becomes "index". When the URL
    has a query string it is appended to the name, so pages that differ only
    by query (e.g. "?id=1" vs "?id=2") do not overwrite each other.
    Characters that are not allowed in filenames on common filesystems
    (such as the ":" of a port number) are replaced by underscores.

    Parameters:
    - url: the URL the content was fetched from.
    - content: the page text to write (encoded as UTF-8).
    - root_folder: destination directory; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(root_folder, exist_ok=True)

    parsed = urlparse(url)
    path = parsed.path.strip("/")
    stem = path.replace("/", "_") if path else "index"
    if parsed.query:
        # Distinct query strings are distinct pages; keep them apart on disk.
        stem = f"{stem}_{parsed.query}"

    # Neutralise characters that are path separators or reserved on Windows.
    unsafe = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})
    filename = f"{parsed.netloc}_{stem}.html".translate(unsafe)

    file_path = os.path.join(root_folder, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Saved: {url} -> {file_path}")


In [None]:
def crawl(url, domain, visited, max_depth=2, depth=0, timeout=10):
    """
    Recursively crawl pages starting from the given URL.

    Each page that responds with status 200 is saved via save_page, then
    every <a href> on it that resolves to the same domain is crawled one
    level deeper. Errors are reported with print and never propagate, so a
    single bad page or link does not stop the rest of the crawl.

    Parameters:
    - url: current URL to crawl. Any "#fragment" is dropped before the URL
      is recorded, because fragments point inside a page, not at another
      page.
    - domain: root domain (netloc) to limit crawling.
    - visited: a set of already visited URLs; updated in place.
    - max_depth: maximum recursion depth to avoid overly deep crawling.
    - depth: current recursion depth.
    - timeout: seconds to wait for the server before giving up on a page.
      Without a timeout a single unresponsive host would block forever.
    """
    if depth > max_depth:
        return

    try:
        url = urldefrag(url).url
    except ValueError as e:
        # A structurally broken URL cannot be fetched; report and move on.
        print("Error while crawling", url, ":", e)
        return

    if url in visited:
        return

    visited.add(url)
    print("Crawling:", url)

    # Fetch, save and parse are best-effort for this page only. The recursion
    # happens outside this try so that each link is handled independently.
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve {url} (Status code: {response.status_code})")
            return

        content = response.text
        save_page(url, content)

        if depth >= max_depth:
            # Children would be rejected by the depth check; skip parsing.
            return

        # Parse the HTML to find all link tags.
        soup = BeautifulSoup(content, "html.parser")
        hrefs = [link["href"] for link in soup.find_all("a", href=True)]
    except Exception as e:
        print("Error while crawling", url, ":", e)
        return

    for href in hrefs:
        try:
            # Build the absolute URL from the href attribute.
            next_url = urldefrag(urljoin(url, href)).url
            # Only crawl links within the same domain.
            same_site = is_valid(next_url) and get_domain(next_url) == domain
        except ValueError:
            # A malformed href must not discard the remaining links.
            continue
        if same_site and next_url not in visited:
            crawl(next_url, domain, visited, max_depth, depth + 1, timeout)


In [None]:
# Driver cell: configure the crawl and run it.

# Root URL where crawling starts; its netloc defines the crawl boundary.
root_url = "https://example.com"  # <-- Change this to your target URL

# crawl() only follows links whose netloc equals this value exactly.
domain = get_domain(root_url)

# Shared across all recursive crawl() calls so each URL is fetched once
# and link cycles terminate.
visited = set()

# Maximum link depth from root_url (the root itself is depth 0).
# NOTE(review): 200 is far deeper than crawl()'s default of 2; each level is
# one nested crawl() call, so on a large site this effectively means
# "download everything reachable" — lower it if that is not intended.
max_depth = 200  # You can adjust this value as needed.

# Start the crawling process; progress and errors are reported via print.
crawl(root_url, domain, visited, max_depth=max_depth)
