In [12]:
import asyncio
import json
import re
from collections import deque
from urllib.parse import urldefrag, urljoin, urlsplit

import aiohttp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [20]:
def init_selenium():
    """Create the Chrome WebDriver used for the Selenium fallback.

    Chrome is started with the ``--headless`` and ``--disable-gpu`` flags,
    and the chromedriver path comes from ``ChromeDriverManager().install()``.

    Returns:
        The new ``webdriver.Chrome`` instance. This function does not close
        it; the caller has to call ``quit()`` when finished.
    """
    opts = Options()
    for flag in ("--headless", "--disable-gpu"):
        opts.add_argument(flag)

    # Wrap the resolved chromedriver path in a Service for the WebDriver
    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=opts)

# Extract links from a page asynchronously
async def fetch_links(session, url):
    """Download `url` and return the absolute target of every ``<a href>`` on it.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and ``text()`` (the crawler
            passes an ``aiohttp.ClientSession``).
        url: Page to download; also the base for resolving relative hrefs.

    Returns:
        list: Absolute link URLs in document order. Empty when the response
        status is not 200, or when any exception occurs (the exception is
        printed, not raised).
    """
    try:
        async with session.get(url) as response:
            # Anything other than a plain 200 is treated as "no links"
            if response.status != 200:
                return []

            markup = await response.text()
            anchors = BeautifulSoup(markup, 'html.parser').find_all('a', href=True)
            return [urljoin(url, anchor['href']) for anchor in anchors]
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return []

# Selenium fallback for JS-rendered pages
def fetch_links_selenium(driver, url):
    """Load `url` in the browser and return the absolute target of every ``<a href>``.

    Args:
        driver: Object exposing ``get(url)`` and ``page_source`` (the crawler
            passes the Selenium WebDriver from ``init_selenium``).
        url: Page to load; also the base for resolving relative hrefs.

    Returns:
        list: Absolute link URLs in document order, or an empty list when any
        exception occurs (the exception is printed, not raised).
    """
    try:
        driver.get(url)
        # Parse the DOM as the browser currently reports it
        rendered = BeautifulSoup(driver.page_source, 'html.parser')
        hrefs = (tag['href'] for tag in rendered.find_all('a', href=True))
        return [urljoin(url, href) for href in hrefs]
    except Exception as e:
        print(f"Error fetching {url} with Selenium: {e}")
        return []

# Filter product URLs
def filter_product_urls(urls):
    """Keep only the URLs that contain one of the product path markers.

    A URL is kept when any of ``/product/``, ``/item/``, ``/p/``, ``/dp/`` or
    ``/gp/product/`` occurs anywhere in it.

    Args:
        urls: Iterable of URL strings.

    Returns:
        list: The matching URLs in their original order; duplicates in the
        input are preserved.
    """
    product_patterns = [r"/product/", r"/item/", r"/p/", r"/dp/", r"/gp/product/"]
    # One alternation matches exactly when at least one individual pattern does
    product_marker = re.compile("|".join(product_patterns))
    return list(filter(product_marker.search, urls))

# Decide whether a link stays on the site being crawled
def _is_same_site(link, root_host):
    """Return True if `link` is an http(s) URL on `root_host` or a subdomain of it.

    Args:
        link: Absolute URL to test.
        root_host: Lower-case host name of the crawled site, without a
            leading "www.".
    """
    try:
        parts = urlsplit(link)
        host = parts.hostname or ""
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): never follow it
        return False
    if parts.scheme not in ("http", "https") or not host:
        # Skips mailto:, javascript:, tel: and similar non-page links
        return False
    return host == root_host or host.endswith("." + root_host)


# Crawl a domain
async def crawl_domain(domain, driver, max_products=100):
    """Breadth-first crawl of one site, collecting URLs that look like product pages.

    Starting from `domain`, each page is fetched with `fetch_links`; when that
    yields no links the page is retried through `fetch_links_selenium`.
    Every link found is tested with `filter_product_urls`, but only http(s)
    links on the start URL's host (ignoring a leading "www.") or one of its
    subdomains are queued for further crawling, so the crawl stays on the
    requested site. URL fragments are dropped before queueing and each page
    is queued at most once.

    Args:
        domain: Start URL, e.g. "https://www.amazon.in/".
        driver: Selenium driver handed to `fetch_links_selenium`.
        max_products: Soft cap. No further page is fetched once more than
            this many product URLs have been collected; the result is not
            truncated, so it can exceed the cap.

    Returns:
        list: The distinct product URLs found, in arbitrary order.
    """
    product_urls = set()

    start_url = urldefrag(domain).url
    root_host = urlsplit(start_url).hostname or ""
    if root_host.startswith("www."):
        root_host = root_host[len("www."):]

    # URLs are marked as discovered when they are queued (not when they are
    # popped), which keeps duplicates out of the queue entirely.
    discovered_urls = {start_url}
    queue = deque([start_url])

    async with aiohttp.ClientSession() as session:
        while queue and len(product_urls) <= max_products:
            url = queue.popleft()

            # Fetch links asynchronously
            links = await fetch_links(session, url)

            # Fallback to Selenium for JS-rendered pages.
            # NOTE: this call is synchronous and blocks the event loop while
            # the browser loads the page.
            if not links:
                links = fetch_links_selenium(driver, url)

            # Filter product URLs
            product_urls.update(filter_product_urls(links))

            # Queue unseen, same-site pages; "#fragment" variants are the same page
            for link in links:
                page_url = urldefrag(link).url
                if page_url not in discovered_urls and _is_same_site(page_url, root_host):
                    discovered_urls.add(page_url)
                    queue.append(page_url)

    return list(product_urls)

# Main function to orchestrate the crawling process
async def main(domains):
    """Crawl every start URL in `domains` and print the product URLs found.

    One Selenium driver from `init_selenium()` is shared by all crawls and is
    quit afterwards even if a crawl raises. The mapping of start URL to
    product-URL list is printed; nothing is returned.

    Args:
        domains: Iterable of start URLs, each passed to `crawl_domain`.
    """
    results = {}
    driver = init_selenium()

    try:
        for domain in domains:
            print(f"Crawling: {domain}")
            product_links = await crawl_domain(domain, driver)
            results[domain] = product_links
    finally:
        # Shut the browser down whether or not the crawl succeeded
        driver.quit()

    print(results)

    # Persisting the results is currently disabled:
    # with open('product_urls.json', 'w') as f:
    #     json.dump(results, f, indent=4)
    # print("Crawling completed. Results saved to 'product_urls.json'")


In [21]:
# List of domains to crawl
domains = [
    # "https://www.flipkart.com/",
    "https://www.amazon.in/",
    # Add more domains here
]

# Run the crawler
await main(domains)

Crawling: https://www.amazon.in/
{'https://www.amazon.in/': ['https://www.amazon.in/SJEWARE-Pairs-Solid-Ankle-Multicolor/dp/B0BZVZYXV5?_encoding=UTF8&pd_rd_w=uy6nP&content-id=amzn1.sym.211684f4-ebe1-443f-8a4a-0773471e979f&pf_rd_p=211684f4-ebe1-443f-8a4a-0773471e979f&pf_rd_r=FK2PCTH9K5TVJYSEVMWS&pd_rd_wg=IXJ5o&pd_rd_r=44543a73-bfae-4ef3-b24c-f44599900f04&ref_=pd_hp_d_btf_crs_zg_bs_1571271031', 'https://www.amazon.in/BLINKIN-Velvet-Touch-Thermal-Tights/dp/B0DF2WPXV2?_encoding=UTF8&pd_rd_w=uy6nP&content-id=amzn1.sym.211684f4-ebe1-443f-8a4a-0773471e979f&pf_rd_p=211684f4-ebe1-443f-8a4a-0773471e979f&pf_rd_r=FK2PCTH9K5TVJYSEVMWS&pd_rd_wg=IXJ5o&pd_rd_r=44543a73-bfae-4ef3-b24c-f44599900f04&ref_=pd_hp_d_btf_crs_zg_bs_1571271031', 'https://www.amazon.in/FUNDAY-FASHION-Printed-Oversized-T-Shirt/dp/B0B1VK7Q12?_encoding=UTF8&pd_rd_w=WmG9O&content-id=amzn1.sym.6a567e3d-fd9a-4932-aa05-d0107e1bcce7&pf_rd_p=6a567e3d-fd9a-4932-aa05-d0107e1bcce7&pf_rd_r=FK2PCTH9K5TVJYSEVMWS&pd_rd_wg=IXJ5o&pd_rd_r=44543a73