In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time
import os


In [2]:
# Path to your local ChromeDriver
chrome_driver_path = "/Users/mehak/chromedriver/chromedriver"
service = Service(executable_path=chrome_driver_path)

In [3]:
# Set Chrome options (remove headless for debugging)
options = webdriver.ChromeOptions()
# options.add_argument('--headless')  # Uncomment for headless mode

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=options)

def get_product_info_and_save_html(url, file_name):
    # Navigate to the page
    driver.get(url)

    try:
        # Click on "United States" button if necessary (wait and interact with it)
        usa_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//img[@alt='United States']"))
        )
        usa_button.click()

        # Wait for the page to load fully
        time.sleep(5)  # Give extra time for the page to load fully

        # Get the page source after the page is fully loaded
        page_source = driver.page_source

        # Save the page source to a file
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(page_source)

        print(f"HTML content successfully saved to {file_name}!")

        # Now search for GTIN, UPC, SKU, and model number in the page source
        search_identifiers_in_html(file_name)

    except Exception as e:
        print(f"Error: {e}")

def search_identifiers_in_html(file_name):
    # Read the saved HTML file
    with open(file_name, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Define regex patterns to search for identifiers
    patterns = {
        'GTIN-13': re.compile(r'gtin13[\'"]?\s*:\s*[\'"]?(\d+)'),
        'UPC': re.compile(r'upc[\'"]?\s*:\s*[\'"]?(\d+)'),
        'SKU': re.compile(r'sku[\'"]?\s*:\s*[\'"]?(\d+)'),
        'Model Number': re.compile(r'model[\'"]?\s*:\s*[\'"]?([\w-]+)')
    }

    # Search for the identifiers
    found_data = {}
    for key, pattern in patterns.items():
        match = pattern.search(html_content)
        if match:
            found_data[key] = match.group(1)

    # Print the found data
    if found_data:
        print("Extracted Product Identifiers:")
        for key, value in found_data.items():
            print(f"{key}: {value}")
    else:
        print("No product identifiers found.")

# Example usage:
url = "https://www.bestbuy.com/site/ge-top-control-built-in-dishwasher-with-sanitize-cycle-and-dry-boost-52-dba-stainless-steel/6474683.p?skuId=6474683"
file_name = 'bestbuy_page_source.html'
get_product_info_and_save_html(url, file_name)

# Close the browser
driver.quit()

HTML content successfully saved to bestbuy_page_source.html!
Extracted Product Identifiers:
GTIN-13: 0084691863625
SKU: 6474683
Model Number: GDT550PYRFS


In [4]:

# Set Chrome options for headless mode
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Runs Chrome in headless mode
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems
options.add_argument("--window-size=1920,1080")  # Set window size for headless mode
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")  # Real user agent

# Initialize WebDriver in headless mode
driver = webdriver.Chrome(service=service, options=options)

def get_product_info_and_save_html(url, file_name):
    # Navigate to the page
    driver.get(url)
    print("Navigated to the page...")

    try:
        # Wait and interact with "United States" button if it exists
        try:
            usa_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//img[@alt='United States']"))
            )
            usa_button.click()
            print("Clicked 'United States' button.")
        except Exception as e:
            print("No 'United States' button to click, moving on...")

        # Wait for the page to load fully by waiting for a key element (e.g., product title)
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, "//h1[contains(@class, 'product-title')]"))
        )
        print("Product page fully loaded...")

        # Get the page source after the page is fully loaded
        page_source = driver.page_source

        # Save the page source to a file
        with open(file_name, 'w', encoding='utf-8') as file:
            file.write(page_source)

        print(f"HTML content successfully saved to {file_name}!")

        # Now search for GTIN, UPC, SKU, and model number in the page source
        search_identifiers_in_html(file_name)

    except Exception as e:
        print(f"Error: {e}")

    finally:
        # Delete the saved HTML file
        if os.path.exists(file_name):
            os.remove(file_name)
            print(f"Deleted the file {file_name}")

def search_identifiers_in_html(file_name):
    # Read the saved HTML file
    with open(file_name, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Define regex patterns to search for identifiers
    patterns = {
        'GTIN-13': re.compile(r'gtin13[\'"]?\s*:\s*[\'"]?(\d+)'),
        'UPC': re.compile(r'upc[\'"]?\s*:\s*[\'"]?(\d+)'),
        'SKU': re.compile(r'sku[\'"]?\s*[\'"]?(\d+)'),
        'Model Number': re.compile(r'model[\'"]?\s*[\'"]?([\w-]+)')
    }

    # Search for the identifiers
    found_data = {}
    for key, pattern in patterns.items():
        match = pattern.search(html_content)
        if match:
            found_data[key] = match.group(1)

    # Print the found data
    if found_data:
        print("Extracted Product Identifiers:")
        for key, value in found_data.items():
            print(f"{key}: {value}")
    else:
        print("No product identifiers found.")

# Example usage:
url = "https://www.bestbuy.com/site/ge-profile-top-control-smart-built-in-stainless-steel-tub-dishwasher-with-3rd-rack-dedicated-jet-targeted-wash-and-42-dba-stainless-steel/6572540.p?skuId=6572540"
file_name = 'bestbuy_page_source.html'
get_product_info_and_save_html(url, file_name)

# Close the browser
driver.quit()

Navigated to the page...
Clicked 'United States' button.
Error: Message: 
Stacktrace:
0   chromedriver                        0x00000001009c3ed4 cxxbridge1$str$ptr + 1906348
1   chromedriver                        0x00000001009bc344 cxxbridge1$str$ptr + 1874716
2   chromedriver                        0x00000001005d0264 cxxbridge1$string$len + 89492
3   chromedriver                        0x0000000100614514 cxxbridge1$string$len + 368708
4   chromedriver                        0x000000010064e7d4 cxxbridge1$string$len + 606980
5   chromedriver                        0x0000000100609134 cxxbridge1$string$len + 322660
6   chromedriver                        0x0000000100609d84 cxxbridge1$string$len + 325812
7   chromedriver                        0x000000010098bf90 cxxbridge1$str$ptr + 1677160
8   chromedriver                        0x00000001009908fc cxxbridge1$str$ptr + 1695956
9   chromedriver                        0x00000001009714b8 cxxbridge1$str$ptr + 1567888
10  chromedriver         