In [2]:
import concurrent.futures
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd


# Function to sanitize sheet names by removing invalid characters
def sanitize_sheet_name(name):
    """Return *name* with characters that are unsafe in sheet names removed.

    Strips the characters Excel forbids in worksheet names
    (``[ ] : * ? / \\``) as well as ``" < > |``, which were already removed
    by earlier versions of this helper. Leading and trailing apostrophes are
    dropped too, because Excel rejects sheet names that start or end with one.

    The result is not truncated; callers are responsible for enforcing the
    31-character sheet-name limit.

    Args:
        name: Raw text to use as a sheet name.

    Returns:
        The cleaned string (possibly empty).
    """
    cleaned = re.sub(r'[\[\]\\/*?:"<>|]', "", name)
    return cleaned.strip("'")


# Set up Selenium WebDriver with optimizations
def setup_driver(driver_path=None):
    """Create a headless Chrome WebDriver configured for lightweight scraping.

    Chrome is started with ``--headless``, ``--no-sandbox``,
    ``--disable-dev-shm-usage``, ``--disable-gpu`` and with image loading
    switched off.

    Args:
        driver_path: Filesystem path to the chromedriver executable. When
            omitted, the original hard-coded location is used so existing
            callers keep working unchanged.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller owns it and must call
        ``quit()`` when done.
    """
    if driver_path is None:
        # Replace with the correct path to chromedriver
        driver_path = (
            r"C:\Users\Nishant shah\OneDrive\Desktop\Nishant\Software\chromedriver.exe"
        )

    chrome_options = Options()
    for flag in (
        "--headless",  # Run in headless mode (no browser window)
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",  # Disable GPU to save resources
        "--blink-settings=imagesEnabled=false",  # Disable images
    ):
        chrome_options.add_argument(flag)

    service = Service(driver_path)
    return webdriver.Chrome(service=service, options=chrome_options)


# Function to scrape a single URL
def scrape_url(url):
    """Scrape one listing page and return its entries.

    Opens ``https://www.factsonacts.nl`` + *url* in a fresh WebDriver, reads
    the breadcrumb text as the category, then collects a title, website link
    and e-mail address from every ``uk-grid-margin`` element on the page.
    Any field that cannot be read is recorded as ``"N/A"``.

    Args:
        url: Site-relative path, e.g. ``"/web/kvp/1/"``.

    Returns:
        A list of dicts with the keys ``"Category"``, ``"Title"``,
        ``"Website"`` and ``"Email"``, one per element found.

    Raises:
        Whatever ``setup_driver`` or ``driver.get`` raise; the browser is
        still shut down in that case.
    """
    driver = setup_driver()
    # try/finally guarantees the browser process is closed even when loading
    # or parsing the page fails part-way through.
    try:
        print(f"Processing URL: {url}")  # Print the URL being processed
        driver.get(f"https://www.factsonacts.nl{url}")

        # Wait (up to 10 s) for the category element to appear.
        # `except Exception` keeps the best-effort fallback but, unlike a bare
        # `except:`, lets KeyboardInterrupt/SystemExit through.
        try:
            category = (
                WebDriverWait(driver, 10)
                .until(
                    EC.presence_of_element_located((By.CLASS_NAME, "uk-breadcrumb"))
                )
                .text.strip()
            )
            print(f"Category found: {category}")
        except Exception:
            category = "N/A"
            print("No category found, setting as 'N/A'")

        # Extract all details
        details = driver.find_elements(By.CLASS_NAME, "uk-grid-margin")

        data = []
        for i, detail in enumerate(details, start=1):
            # Extract title
            try:
                title = detail.find_element(By.TAG_NAME, "h3").text.strip()
                print(f"Extracted title: {title}")
            except Exception:
                title = "N/A"
                print(f"Failed to extract title for entry {i}, setting as 'N/A'")

            # Extract website link: an absolute link that is either labelled
            # "Website" or styled as a button.
            try:
                website_link = detail.find_element(
                    By.XPATH,
                    ".//a[contains(@href, 'http') and (contains(text(), 'Website') or contains(@class, 'uk-button'))]",
                ).get_attribute("href")
                print(f"Extracted website: {website_link}")
            except Exception:
                website_link = "N/A"
                print(f"Failed to extract website for entry {i}, setting as 'N/A'")

            # Extract email (using mailto link)
            try:
                email_link = detail.find_element(
                    By.XPATH, ".//a[contains(@href, 'mailto:')]"
                ).get_attribute("href")
                email_link = email_link.replace("mailto:", "")
                print(f"Extracted email: {email_link}")
            except Exception:
                email_link = "N/A"
                print(f"Failed to extract email for entry {i}, setting as 'N/A'")

            # Store the data
            data.append(
                {
                    "Category": category,
                    "Title": title,
                    "Website": website_link,
                    "Email": email_link,
                }
            )

        return data
    finally:
        driver.quit()  # Close the browser after processing the URL


# List of URLs to scrape
# Site-relative listing pages to scrape, built from their numeric page ids.
# The order of the ids is the order in which the pages are submitted.
urls = [
    f"/web/kvp/{page_id}/"
    for page_id in (
        1, 3, 6, 12, 17, 19, 20, 23, 25, 26,
        28, 32, 34, 35, 36, 37, 38, 41, 42, 45,
        47, 48, 49, 50, 51, 55, 59, 60, 69, 71,
        72, 73, 74, 75, 76, 77, 78, 79, 80, 83,
        84, 85, 86, 87, 92, 95, 96, 102, 103, 112,
        116, 118, 119, 127, 132, 134, 136, 137, 146, 147,
        154, 156, 157, 160, 165, 169, 179, 194, 204, 205,
        226, 228, 247, 287, 290, 292, 293, 297, 301, 305,
        313, 328, 329, 330, 331, 332, 334, 337, 343, 347,
        353, 361, 364, 366, 368, 375, 376, 377, 379, 383,
        384, 385, 386, 389, 390, 391, 392, 393, 394, 395,
        398, 399, 400, 401, 403, 406, 407, 410, 411, 412,
        414, 500, 501, 505, 510, 525, 551, 553, 600,
    )
]

# Create an empty dictionary to store data by category
data_by_category = {}

# Scrape the pages in parallel. Futures are kept in submission order so the
# grouped entries appear in the same order as `urls`, as they did with
# `executor.map`.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [(url, executor.submit(scrape_url, url)) for url in urls]

    # Process results and group by category
    for url, future in futures:
        try:
            result = future.result()
        except Exception as exc:
            # A single broken page must not throw away everything else that
            # was scraped; report it and carry on with the remaining URLs.
            print(f"Skipping URL {url} after error: {exc}")
            continue
        for entry in result:
            data_by_category.setdefault(entry["Category"], []).append(entry)

# Save data to Excel with separate sheets for each category
used_sheet_names = set()
with pd.ExcelWriter("scraped_data_parallel.xlsx", engine="xlsxwriter") as writer:
    for category, data in data_by_category.items():
        # Sheet name max length is 31 chars; fall back to a placeholder when
        # sanitizing leaves nothing usable.
        base_name = sanitize_sheet_name(category)[:31] or "Sheet"

        # Different categories can collapse to the same sanitized/truncated
        # name. Excel compares sheet names case-insensitively, so make the
        # name unique by appending a numeric suffix within the 31-char limit.
        sheet_name = base_name
        suffix = 2
        while sheet_name.lower() in used_sheet_names:
            tag = f"_{suffix}"
            sheet_name = base_name[: 31 - len(tag)] + tag
            suffix += 1
        used_sheet_names.add(sheet_name.lower())

        df = pd.DataFrame(data)
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print("Data has been saved to 'scraped_data_parallel.xlsx'")

Processing URL: /web/kvp/6/
Processing URL: /web/kvp/1/
Processing URL: /web/kvp/3/
Processing URL: /web/kvp/17/
Processing URL: /web/kvp/12/
Category found: Rubriek: Beveiliging
Extracted title: ABO BEVEILIGING B.V.
Extracted website: https://www.abo-beveiliging.nl/
Extracted email: info@abo-beveiliging.nl
Extracted title: ALL4EVENTS & ENTERTAINMENT
Extracted website: https://a4e.nl/
Extracted email: erik@A4E.nl
Extracted title: ALL4EVENTS.NL
Extracted website: https://www.all4events.nl/
Extracted email: mail@All4Events.nl
Extracted title: ALPHA SECURITY INT.
Extracted website: https://www.alphasecurity.nl/
Extracted email: info@alphasecurity.nl
Extracted title: ANDOR SECURITY SERVICE
Extracted website: https://www.andor-security.nl/
Extracted email: info@andor-security.nl
Extracted title: AVAQ NEDERLAND
Extracted website: https://www.avaq.eu/
Extracted email: info@avaq.eu
Extracted title: BLOEMINK SERVICE
Extracted website: https://bloeminksecurity.nl/
Extracted email: info@bloeminks