In [None]:
# Import necessary libraries for web scraping, selenium, and pydantic validation
import os
import random
import time
from typing import Dict, List, Optional, Union
from urllib.parse import quote_plus, urljoin

import undetected_chromedriver as uc
from bs4 import BeautifulSoup, Tag
from dotenv import load_dotenv
from pydantic import BaseModel, Field, HttpUrl
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from logger import logger
from utils.common import load_and_scroll, pagination, save_as, soup_maker

# Load environment variables via python-dotenv before any class body below
# calls os.getenv (ScraperConfig reads SCRAPEOPS_API_KEY at class-definition time).
load_dotenv()

True

In [None]:
class SearchProduct(BaseModel):
    """
    Validated representation of one product from a search-results page.

    Attributes:
        title: Product title; must be at least 3 characters long.
        link: URL of the product page (required, validated as an HTTP(S) URL).
        image: URL of the product image, or None when unavailable.
        price: Product price, or None when not extracted.
        rating: Product rating, or None when not extracted.
        total_ratings: Number of ratings, or None when not extracted.
    """

    title: str = Field(..., min_length=3)
    link: HttpUrl
    # Explicit ``None`` defaults: under pydantic v2 an ``Optional[...]``
    # annotation without a default is still a *required* field, so callers
    # would otherwise have to pass every one of these explicitly.
    image: Optional[HttpUrl] = None
    price: Optional[float] = None
    rating: Optional[float] = None
    total_ratings: Optional[int] = None

def pydentic_(results:List[Dict[str, str]]) -> List[SearchProduct]:
    """
    Validate raw scraped product dicts into ``SearchProduct`` models.

    NOTE: this notebook defines ``pydentic_`` again in a later cell; the later
    definition shadows this one once that cell has run.

    Args:
        results: Dicts carrying the keys ``"Title"``, ``"Link"`` and
            (optionally) ``"Image"``.

    Returns:
        One ``SearchProduct`` per input dict, in input order. ``price``,
        ``rating`` and ``total_ratings`` are always ``None`` here.

    Raises:
        KeyError: If a dict lacks ``"Title"`` or ``"Link"``.
        pydantic.ValidationError: If the title, the link or a non-empty image
            value fails validation.
    """
    search_products = []
    for result in results:
        item = SearchProduct(
            title=result["Title"],
            # The extractor reports a missing image as "" and an empty string
            # is not a valid HttpUrl, so normalise empty/absent values to None.
            image=result.get("Image") or None,
            link=result["Link"],
            price=None,
            rating=None,
            total_ratings=None
        )
        search_products.append(item)
        logger.info(f"ℹ️ Product {item.title} successfully extracted")
    return search_products




In [43]:
class ScraperConfig:
    """
    Scraper configuration manager for initializing Selenium WebDriver instances.

    This class supports various configurations such as:
    - Headless mode
    - Incognito mode
    - Proxy configuration (ScrapeOps)
    - SeleniumWire support
    - Random User-Agent rotation

    Attributes:
        SCRAPEOPS_API_KEY (Optional[str]): API key for ScrapeOps service, read
            from the environment when the class body executes (None if unset).
        use_uc (bool): Whether to use undetected_chromedriver (UC).
        headless (bool): Whether to enable headless mode for the WebDriver.
        incognito (bool): Whether to start the browser in incognito mode.
        use_scrapeops (bool): Whether to use ScrapeOps proxy service.
        use_seleniumwire (bool): Whether to use SeleniumWire for intercepting
            requests. Only consulted on the standard (non-UC) driver path.
        uc_options (uc.ChromeOptions): Options object used for the UC driver.
        chrome_options (ChromeOptions): Options object used for the standard driver.
        proxy (Optional[str]): Proxy URL for ScrapeOps (None when disabled).
        user_agents (List[str]): List of User-Agent strings for random rotation.
        random_user_agent (str): The custom User-Agent if one was given,
            otherwise a randomly selected entry of ``user_agents``.
        driver (Union[webdriver.Chrome, "seleniumwire.webdriver.Chrome", uc.Chrome]):
            The initialized WebDriver instance.
    """

    # os.getenv returns None when the variable is unset, hence Optional.
    SCRAPEOPS_API_KEY: Optional[str] = os.getenv("SCRAPEOPS_API_KEY")

    def __init__(
        self,
        use_uc: bool = False,
        headless: bool = False,
        incognito: bool = True,
        user_agent: Optional[str] = None,
        use_scrapeops: bool = False,
        use_seleniumwire: bool = False,
    ) -> None:
        """
        Initializes the ScraperConfig object and starts the WebDriver.

        Args:
            use_uc (bool): Whether to use undetected_chromedriver (UC).
            headless (bool): Whether to run the browser in headless mode.
            incognito (bool): Whether to run the browser in incognito mode.
            user_agent (Optional[str]): Custom User-Agent to use; a random one
                from the built-in list is chosen when omitted.
            use_scrapeops (bool): Whether to use ScrapeOps proxy service.
            use_seleniumwire (bool): Whether to use SeleniumWire.

        Raises:
            ValueError: If ``use_scrapeops`` is True but ``SCRAPEOPS_API_KEY``
                is not set in the environment.
        """
        self.use_uc = use_uc
        self.headless = headless
        self.incognito = incognito
        self.use_scrapeops = use_scrapeops
        self.use_seleniumwire = use_seleniumwire

        # Fail early: without this check the proxy URL below would silently
        # embed the literal text "None" as the password.
        if self.use_scrapeops and not self.SCRAPEOPS_API_KEY:
            raise ValueError(
                "use_scrapeops=True requires the SCRAPEOPS_API_KEY "
                "environment variable to be set"
            )

        self.uc_options = uc.ChromeOptions()
        self.chrome_options = ChromeOptions()

        self.proxy: Optional[str] = None
        if self.use_scrapeops:
            self.proxy = (
                f"http://scrapeops.headless_browser_mode=true:{self.SCRAPEOPS_API_KEY}@proxy.scrapeops.io:5353"
            )

        self.user_agents: List[str] = self._load_user_agents()
        self.random_user_agent: str = (
            user_agent or
            random.choice(self.user_agents)
        )

        self.driver = self._init_driver()

    def _init_driver(
        self,
    ) -> Union[webdriver.Chrome, "seleniumwire.webdriver.Chrome", uc.Chrome]:
        """
        Initializes the appropriate driver based on the configuration.

        Returns:
            Union[webdriver.Chrome, seleniumwire.webdriver.Chrome, uc.Chrome]:
            The UC driver when ``use_uc`` is set, otherwise the standard
            Chrome driver (with or without SeleniumWire).
        """
        if self.use_uc:
            logger.info("⚙️ Using undetected_chromedriver (UC)")
            return self._get_uc_driver()

        logger.info("⚙️ Using standard Chrome driver")
        return self._get_normal_driver()

    def _get_uc_driver(self) -> uc.Chrome:
        """
        Configures and returns an undetected_chromedriver (UC) instance.

        Returns:
            uc.Chrome: The configured undetected Chrome driver.

        Notes:
            Applies the common options plus the
            ``--disable-blink-features=AutomationControlled`` switch.
            ``use_seleniumwire`` is not consulted on this path.
        """
        self._apply_common_options(self.uc_options)

        # Additional stealth option for UC
        self.uc_options.add_argument(
            "--disable-blink-features=AutomationControlled"
        )

        return uc.Chrome(options=self.uc_options)

    def _get_normal_driver(
        self,
    ) -> Union[webdriver.Chrome, "seleniumwire.webdriver.Chrome"]:
        """
        Configures and returns a standard Chrome WebDriver.

        Returns:
            Union[webdriver.Chrome, seleniumwire.webdriver.Chrome]:
            The configured normal Chrome driver.

        Notes:
            If SeleniumWire is enabled, the upstream proxy (if any) is handed
            to SeleniumWire through ``seleniumwire_options`` instead of the
            ``--proxy-server`` Chrome switch.
        """
        if self.use_seleniumwire:
            # Imported lazily so selenium-wire is only required when requested.
            from seleniumwire import webdriver as wire_webdriver

            # SeleniumWire routes the browser through its own local proxy and
            # forwards to the upstream one; adding --proxy-server as well
            # would conflict with that routing, so it is left out here.
            self._apply_common_options(self.chrome_options, include_proxy=False)

            seleniumwire_options = {}
            if self.proxy:
                seleniumwire_options = {
                    "proxy": {
                        "http": self.proxy,
                        "https": self.proxy,
                        "no_proxy": "localhost,127.0.0.1",
                    }
                }

            return wire_webdriver.Chrome(
                options=self.chrome_options,
                seleniumwire_options=seleniumwire_options
            )

        self._apply_common_options(self.chrome_options)

        return webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=self.chrome_options,
        )

    def _apply_common_options(
        self,
        options: Union[ChromeOptions, uc.ChromeOptions],
        include_proxy: bool = True,
    ) -> None:
        """
        Applies common options for Chrome based on the current configuration.

        Args:
            options (Union[ChromeOptions, uc.ChromeOptions]):
                The options object to configure (mutated in place).
            include_proxy (bool): When True (default) and a proxy is
                configured, add it as a ``--proxy-server`` switch.

        This function adds configurations for:
        - Headless mode
        - Incognito mode
        - Proxy settings (if any)
        - Sandbox, shared-memory, popup, infobar and window-size switches
        - The selected User-Agent
        """
        if self.headless:
            options.add_argument("--headless=new")
        if self.incognito:
            options.add_argument("--incognito")
        if include_proxy and self.proxy:
            # NOTE(review): Chrome is generally reported to ignore credentials
            # embedded in --proxy-server; authenticated proxies presumably need
            # the SeleniumWire path - confirm.
            options.add_argument(f"--proxy-server={self.proxy}")

        for switch in (
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-popup-blocking",
            "--disable-infobars",
            "--start-maximized",
            "--window-size=1920,1080",
        ):
            options.add_argument(switch)

        # Use the explicit "--" switch form so the argument is also understood
        # when it is handed to the Chrome binary verbatim.
        options.add_argument(f"--user-agent={self.random_user_agent}")

    def _load_user_agents(self) -> List[str]:
        """
        Loads a list of User-Agent strings.

        Returns:
            List[str]: Hard-coded user-agent strings (the list contains some
            repeated entries).
        """
        return [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Linux; Android 14; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/120.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
            "Mozilla/5.0 (Linux; Android 14; SM-S918B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Linux; Android 12; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Mobile Safari/537.36",
            "Mozilla/5.0 (Android 13; Mobile; rv:120.0) Gecko/120.0 Firefox/120.0",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) CriOS/120.0.0.0 Mobile/15E148 Safari/537.36",
            "Mozilla/5.0 (iPad; CPU OS 16_1 like Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Version/16.1 Mobile/15E148 Safari/537.36"
        ]


In [None]:


class ProductExtractor:
    """
    Extracts title, image and link from the product cards of a parsed page.

    Attributes:
        soup: Parsed page to search for product cards.
        base_url: Site root used to resolve relative product links.
    """

    def __init__(self, soup: BeautifulSoup, base_url: str):
        self.soup = soup
        self.base_url = base_url

    def list_items(self) -> List[Tag]:
        """
        Locate all product list items from the soup.

        Returns:
            Every ``div[role="listitem"]`` element, or an empty list when
            none match or the lookup raises.
        """
        try:
            items = self.soup.select('div[role="listitem"]') or []
            logger.info(f"✅ Found {len(items)} product items")
            return items
        except Exception as e:
            logger.error(f"❌ Error locating list items: {e}")
            return []

    @staticmethod
    def extract_text(item: Tag, selector: str, attr: Optional[str] = None) -> str:
        """
        Extract text or an attribute value from the first matching element.

        Args:
            item: Element to search inside.
            selector: Tag name passed to ``item.find``.
            attr: Attribute to read; when omitted the stripped element text
                is returned instead.

        Returns:
            The extracted value, or ``""`` when the element or attribute is
            missing or the lookup raises.
        """
        try:
            element = item.find(selector)
            if element is None:
                # Handle the "not found" case explicitly rather than relying
                # on an AttributeError raised by a placeholder string.
                logger.warning(
                    f"⚠️ Extraction failed for selector '{selector}': element not found"
                )
                return ""

            if attr:
                value = element.get(attr)
                # A missing attribute yields None; keep the "" contract.
                return "" if value is None else value

            return element.text.strip()
        except Exception as e:
            logger.warning(f"⚠️ Extraction failed for selector '{selector}': {e}")
            return ""

    def extract_field(self, item: Tag, field_type: str) -> str:
        """
        Extract a specific field (title, image, link) from a product item.

        Args:
            item: Product card element.
            field_type: One of ``'title'``, ``'image'`` or ``'link'``.

        Returns:
            The title text, the image ``src``, or the link ``href`` resolved
            against ``base_url``; ``""`` for unknown field types or when the
            value cannot be extracted.
        """
        field_selectors = {
            'title': "h2",
            'image': "img",
            'link': "a"
        }

        attrs = {
            'image': "src",
            'link': "href",
        }

        selector = field_selectors.get(field_type)
        attr = attrs.get(field_type)  # Will be None for title

        if not selector:
            logger.warning(f"⚠️ Unknown field type: {field_type}")
            return ""

        extracted = self.extract_text(item, selector, attr)

        if field_type == 'link' and extracted:
            # urljoin keeps absolute hrefs intact and resolves relative ones
            # (with or without a leading slash) against the site root.
            return urljoin(self.base_url, extracted)
        return extracted

    def extract(self) -> List[Dict[str, str]]:
        """
        Build one ``{"Title", "Image", "Link"}`` dict per product item.

        Returns:
            The extracted products in page order; items without a title are
            skipped. Empty when no soup was provided.
        """
        if not self.soup:
            logger.error("❌ No soup provided to extractor")
            return []

        items = self.list_items()
        results = []

        for item in items:
            title = self.extract_field(item, 'title')
            if not title:
                continue  # Skip items with no title

            product = {
                "Title": title,
                "Image": self.extract_field(item, 'image'),
                "Link": self.extract_field(item, 'link'),
            }
            logger.debug(f"📝 Product extracted: {title}")
            results.append(product)

        logger.info(f"✅ Extracted {len(results)} products successfully")
        return results


In [None]:
class AmazonScraper:
    """
    Drives a WebDriver through Amazon search-result pages and extracts products.

    Attributes:
        config: The ScraperConfig that owns the WebDriver.
        driver: The WebDriver instance taken from ``config``.
        url: Site root used to build search URLs and resolve links.
    """

    def __init__(self, config: ScraperConfig):
        self.config = config
        self.driver = config.driver
        self.url = "https://www.amazon.com"
        logger.info("🚀 AmazonScraper initialized")

    def get_search_url(self, keyword: str) -> str:
        """Return the search URL for ``keyword`` (URL-encoded with quote_plus)."""
        encoded = quote_plus(keyword)
        search_url = f"{self.url}/s?k={encoded}"
        logger.debug(f"🔗 Generated search URL: {search_url}")
        return search_url

    def scrape_search_results(self, keyword: str, wait_time: int=3) -> str:
        """
        Load the first search-results page for ``keyword``.

        Args:
            keyword: Search term.
            wait_time: Currently unused; kept for interface compatibility.

        Returns:
            The driver's page source after ``load_and_scroll`` has run, or
            ``""`` if loading raised.
        """
        url = self.get_search_url(keyword)
        logger.info(f"🌐 Navigating to search page: {url}")
        try:
            load_and_scroll(self.driver,url)
            logger.info("✅ Page loaded and ready for scraping")
            return self.driver.page_source
        except Exception as e:
            logger.error(f"❌ Error loading page for keyword '{keyword}': {e}")
            return ""

    def scrape_all_pages(self, keyword: str, max_pages=5)-> List[Dict[str, str]]:
        """
        Scrape up to ``max_pages`` consecutive search-result pages.

        Args:
            keyword: Search term.
            max_pages: Maximum number of result pages to extract.

        Returns:
            Product dicts from all scraped pages, in page order. If a
            follow-up page fails to load, the products collected so far are
            returned.
        """
        results = []
        response = self.scrape_search_results(keyword)
        soup = soup_maker(response)
        page = 0

        while soup and page < max_pages: #apply retry here
            # Build the extractor from the page currently loaded; reusing one
            # created from the first page would re-extract page 1 every time.
            results += ProductExtractor(soup, self.url).extract()
            page += 1

            logger.info(f"📄 Page {page} scraped.")

            if page >= max_pages:
                break  # limit reached: do not load a page that is never scraped

            # pagination is expected to return a falsy value on the last page.
            next_page_url = pagination(soup,self.url)
            if not next_page_url:
                break

            try:
                self.driver.get(next_page_url)
                time.sleep(random.uniform(2, 4))  # can swap with sleeper()
                soup = soup_maker(self.driver.page_source)
            except Exception as e:
                # Same best-effort policy as scrape_search_results: keep what
                # was already collected instead of losing it.
                logger.error(f"❌ Error loading page for keyword '{keyword}': {e}")
                break

        return results

    def quit(self):
        """Quit the underlying WebDriver."""
        logger.info("🛑 Quitting WebDriver")
        self.driver.quit()

In [None]:
# Start a driver with ScraperConfig's defaults (standard Chrome, not headless,
# incognito, no proxy) and hand it to the scraper.
config = ScraperConfig()
amazon = AmazonScraper(config)
# Module-level handle to the same WebDriver instance the scraper uses.
driver = amazon.driver

In [None]:
# Scrape the search results for "32 gb ram", limited to a single page.
results = amazon.scrape_all_pages("32 gb ram", max_pages=1)

In [None]:
def pydentic_(results:List[Dict[str, str]]) -> List[SearchProduct]:
    """
    Validate raw scraped product dicts into ``SearchProduct`` models.

    NOTE: this redefines the ``pydentic_`` declared in an earlier cell of this
    notebook and shadows it; keep the two in sync or remove one of them.

    Args:
        results: Dicts carrying the keys ``"Title"``, ``"Link"`` and
            (optionally) ``"Image"``.

    Returns:
        One ``SearchProduct`` per input dict, in input order. ``price``,
        ``rating`` and ``total_ratings`` are always ``None`` here.

    Raises:
        KeyError: If a dict lacks ``"Title"`` or ``"Link"``.
        pydantic.ValidationError: If the title, the link or a non-empty image
            value fails validation.
    """
    search_products = []
    for result in results:
        item = SearchProduct(
            title=result["Title"],
            # The extractor reports a missing image as "" and an empty string
            # is not a valid HttpUrl, so normalise empty/absent values to None.
            image=result.get("Image") or None,
            link=result["Link"],
            price=None,
            rating=None,
            total_ratings=None
        )
        search_products.append(item)
        logger.info(f"ℹ️ Product {item.title} successfully extracted")
    return search_products



In [None]:
# Validate the raw scraped dicts into SearchProduct models.
search_products = pydentic_(results)

title='【RGB DDR4 RAM】GIGASTONE Game TURBO 32GB Kit (2x16GB)DDR4 3200MHz PC4-25600 CL16-18-18-40 Intel XMP 2.0 AMD Ryzen 1.35V UDIMM 288 Pin Unbuffered Non ECC High Performance Gaming Desktop Memory - Black' link=HttpUrl('https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo0MDA4NDI4NjQ4NjU1MzUyOjE3NDU5NDAyOTU6c3BfYXRmOjMwMDczOTY2NDU4ODcwMjo6MDo6&url=%2FGIGASTONE-Desktop-DDR4-3200MHz-PC4-25600-Unbuffered%2Fdp%2FB0CB2VGYFW%2Fref%3Dsr_1_1_sspa%3Fdib%3DeyJ2IjoiMSJ9.tw7-N2U1w1gwJjC2CkCPw641JRVXa83WrGdQPf5HIjxWiXIHS2ItFhikmEjMXmkzUh1zw154zCgV1rxT2AeCPbys5hk-js3_N6sAIdaySTaMrtvpckyzB8GPuO87-8oRCqAIrzUdXnmg8j1diWHWKSTMqAw6jT9_UyorKq_GufwjbJpyFU5k02obR2BCBOoXEyC3JpgaS7-LmiUjBiEEpBIw6uRYDVsEIDo-gmPyHpw.TzgiXiQugmV_whUPhnA3v21gYEQVVWqDLC3kLVxKtHA%26dib_tag%3Dse%26keywords%3D32%2Bgb%2Bram%26qid%3D1745940295%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1') image=HttpUrl('https://m.media-amazon.com/images/I/510V6PbPGrL._AC_UY218_.jpg') price=None rating=None total_ratings=None
title='【DDR

In [None]:
# Persist the raw (unvalidated) result dicts through the project's save_as helper.
# NOTE(review): presumably writes JSON based on the file name - confirm in utils.common.
save_as(items=results, file_name="first.json")

In [None]:
# Shut down the WebDriver once scraping is finished.
amazon.quit()   