In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse, urljoin, unquote
import os
import time
import requests
from bs4 import BeautifulSoup
import re

# Configuration
BASE_URL = "https://homeradar.kwst.net/"
OUTPUT_DIR = "templates_scraped"
ASSETS_DIR = os.path.join(OUTPUT_DIR, "assets")
CSS_DIR = os.path.join(ASSETS_DIR, "css")
JS_DIR = os.path.join(ASSETS_DIR, "js")
IMG_DIR = os.path.join(ASSETS_DIR, "img")
WAIT_TIME = 10

# Setup headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
driver = webdriver.Chrome(options=options)
driver.get(BASE_URL)

# Wait for page to load
try:
    WebDriverWait(driver, WAIT_TIME).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
except Exception as e:
    print("Failed to load page:", e)
    driver.quit()
    exit()

# Create output folders
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CSS_DIR, exist_ok=True)
os.makedirs(JS_DIR, exist_ok=True)
os.makedirs(IMG_DIR, exist_ok=True)

# Track saved pages to avoid duplicates
saved_pages = set()

# Helper to save assets
def download_asset(url, folder):
    if url.startswith("data:"):
        return url  # Skip inline data URIs
    try:
        filename = os.path.basename(url.split("?")[0])
        filename = unquote(filename)
        if not filename:
            filename = "unnamed"
        path = os.path.join(folder, filename)
        if not os.path.exists(path):
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
                "Referer": BASE_URL,
                "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br"
            }
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            with open(path, "wb") as f:
                f.write(response.content)
        return os.path.relpath(path, OUTPUT_DIR).replace("\\", "/")
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return url

# Normalize file name from URL
def normalize_filename_from_url(url):
    parsed = urlparse(url)
    path = parsed.path.strip("/") or "index"
    if path.endswith("/"):
        path = path[:-1]
    if not path.endswith(".html"):
        path += ".html"
    filename = re.sub(r"[^a-zA-Z0-9_\-]", "_", path)
    return filename

# Save and process a page
def save_page(link):
    page_name = normalize_filename_from_url(link)
    if page_name in saved_pages:
        print(f"Skipping duplicate: {page_name}")
        return
    saved_pages.add(page_name)
    try:
        driver.get(link)
        WebDriverWait(driver, WAIT_TIME).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        time.sleep(2)
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")

        # Remove <base> tags if any
        for base_tag in soup.find_all("base"):
            base_tag.decompose()

        # Download assets and update links
        for tag in soup.find_all(["link", "script", "img"]):
            attr = "href" if tag.name == "link" else "src"
            if tag.has_attr(attr):
                asset_url = urljoin(link, tag[attr])
                if asset_url.endswith(".css"):
                    local_path = download_asset(asset_url, CSS_DIR)
                elif asset_url.endswith(".js"):
                    local_path = download_asset(asset_url, JS_DIR)
                elif any(asset_url.endswith(ext) for ext in [".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"]):
                    local_path = download_asset(asset_url, IMG_DIR)
                else:
                    local_path = download_asset(asset_url, ASSETS_DIR)
                tag[attr] = local_path

        # Save page with cleaned name
        filename = os.path.join(OUTPUT_DIR, page_name)
        if not filename.endswith(".html"):
            filename += ".html"  # Final safeguard
        with open(filename, "w", encoding="utf-8") as f:
            f.write(str(soup))
        print(f"Saved: {filename}")
    except Exception as e:
        print(f"Error saving page {link}: {e}")

# Save base page
save_page(BASE_URL)

# Collect internal links
links = set()
anchors = driver.find_elements(By.TAG_NAME, "a")
for a in anchors:
    href = a.get_attribute("href")
    if href and urlparse(href).netloc == urlparse(BASE_URL).netloc:
        links.add(href.split("#")[0])  # Remove fragments

print(f"Found {len(links)} internal links.")

# Save linked pages
for link in links:
    save_page(link)

# Cleanup
driver.quit()
print("Scraping complete.")

Saved: templates_scraped\index_html.html
Found 30 internal links.
Saved: templates_scraped\listing-single3_html.html
Saved: templates_scraped\listing3_html.html
Saved: templates_scraped\pricing_html.html
Saved: templates_scraped\listing-single2_html.html
Saved: templates_scraped\listing_html.html
Saved: templates_scraped\agency-single_html.html
Saved: templates_scraped\blog-single_html.html
Saved: templates_scraped\listing-single_html.html
Skipping duplicate: index_html
Saved: templates_scraped\listing2_html.html
Saved: templates_scraped\about_html.html
Saved: templates_scraped\dark_index_html.html
Saved: templates_scraped\agent-single_html.html
Saved: templates_scraped\agency-list_html.html
Saved: templates_scraped\index3_html.html
Saved: templates_scraped\contacts_html.html
Saved: templates_scraped\listing4_html.html
Saved: templates_scraped\agent-list_html.html
Saved: templates_scraped\listing5_html.html
Saved: templates_scraped\listing6_html.html
Saved: templates_scraped\blog_html.