In [10]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def is_valid(url):
    """
    Check whether the URL is valid.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_domain(url):
    """
    Extract the domain (netloc) from the URL.
    """
    return urlparse(url).netloc

def save_page(url, content, root_folder="downloaded_pages"):
    """
    Save the page content to a file under the specified folder.
    The filename is generated based on the domain and URL path.
    """
    if not os.path.exists(root_folder):
        os.makedirs(root_folder)
    
    parsed = urlparse(url)
    path = parsed.path.strip("/")
    if not path:
        filename = "index.html"
    else:
        # Replace slashes with underscores to form a valid filename.
        filename = path.replace("/", "_") + ".html"
    
    file_path = os.path.join(root_folder, f"{parsed.netloc}_{filename}")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Saved: {url} -> {file_path}")


In [11]:
def crawl(url, domain, visited, driver, max_depth=2, depth=0):
    """
    Recursively crawl pages starting from the given URL.
    
    Parameters:
    - url: current URL to crawl.
    - domain: root domain to limit crawling.
    - visited: a set of already visited URLs.
    - driver: Selenium WebDriver instance.
    - max_depth: maximum recursion depth to avoid overly deep crawling.
    - depth: current recursion depth.
    """
    if depth > max_depth:
        return
    if url in visited:
        return
    
    visited.add(url)
    print("Crawling:", url)
    
    try:
        # Use Selenium to get the page with JavaScript execution
        driver.get(url)
        
        # Wait for the page to load completely
        WebDriverWait(driver, 10).until(lambda d: d.execute_script('return document.readyState') == 'complete')
        
        # Get the rendered content
        content = driver.page_source
        save_page(url, content)
        
        # Parse the HTML to find all link tags.
        soup = BeautifulSoup(content, "html.parser")
        for link in soup.find_all("a", href=True):
            href = link.get("href")
            # Build the absolute URL from the href attribute.
            next_url = urljoin(url, href)
            if not is_valid(next_url):
                continue
            # Only crawl links within the same domain.
            if get_domain(next_url) == domain:
                crawl(next_url, domain, visited, driver, max_depth, depth + 1)
                
    except Exception as e:
        print("Error while crawling", url, ":", e)


In [12]:
# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Initialize the Chrome WebDriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Set the root URL you want to start crawling from.
root_url = "https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print"  # <-- Change this to your target URL

# Extract the domain from the root URL.
domain = get_domain(root_url)

# Use a set to keep track of visited URLs to avoid loops.
visited = set()

# Set the maximum depth of recursion.
max_depth = 1  # You can adjust this value as needed.

try:
    # Start the crawling process.
    crawl(root_url, domain, visited, driver, max_depth=max_depth)
finally:
    # Make sure the browser is closed when done
    driver.quit()


Crawling: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print
Saved: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print -> downloaded_pages/catalog.clarku.edu_content.php.html
Crawling: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print#acalog-content
Saved: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print#acalog-content -> downloaded_pages/catalog.clarku.edu_content.php.html
Crawling: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print#content
Saved: https://catalog.clarku.edu/content.php?catoid=34&navoid=2847&print#content -> downloaded_pages/catalog.clarku.edu_content.php.html
Crawling: http://catalog.clarku.edu/
Saved: http://catalog.clarku.edu/ -> downloaded_pages/catalog.clarku.edu_index.html
Crawling: https://catalog.clarku.edu/offices/campus-safety-and-security/
Saved: https://catalog.clarku.edu/offices/campus-safety-and-security/ -> downloaded_pages/catalog.clarku.edu_offices_campus-safety-and-secu

KeyboardInterrupt: 