In [1]:
import requests
from bs4 import BeautifulSoup
import re


In [2]:

# URL of the product page
url = "https://www.bestbuy.com/site/ge-top-control-built-in-dishwasher-with-sanitize-cycle-and-dry-boost-52-dba-stainless-steel/6474683.p?skuId=6474683"

# Send a request to get the page content
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
response = requests.get(url, headers=headers)

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, 'lxml')

# Extracting visible text content
page_content = soup.get_text(separator=' ')  # Extract visible text from the page

# Define patterns for specific e-commerce platforms
site_patterns = [
    {'site': 'amazon.com', 'regex': re.compile(r'/\/(?:dp|gp\/product)\/([A-Z0-9]{10})(\/|\?|$)/'), 'type': 'Amazon ASIN'},
    {'site': 'ebay.com', 'regex': re.compile(r'/\/(?:p|itm)\/(\d+)(?:[\/?]|$)/'), 'type': 'eBay Product ID'},
    {'site': 'costco.com', 'regex': re.compile(r'(?:.+)?\.product\.(\d+)\.html\?(.*)$'), 'type': 'Costco Product ID'},
    {'site': 'walmart.com', 'regex': re.compile(r'/\/ip\/[^\/]+\/(\d+)(?:[\/?]|$)/'), 'type': 'Walmart Product ID'},
    {'site': 'aliexpress.com', 'regex': re.compile(r'/item\/(\d+)\.html(\/|\?|$)/'), 'type': 'AliExpress Product ID'},
    {'site': 'bestbuy.com', 'regex': re.compile(r'skuId=(\d+)(\/|\?|$)'), 'type': 'BestBuy SKU'},
    {'site': 'target.com', 'regex': re.compile(r'/\/p\/[^\/]+\/-\/A-(\d+)(?:[\/?#]|$)/'), 'type': 'Target Product ID'},
    {'site': 'newegg.com', 'regex': re.compile(r'(?:\/p\/|Item=)([A-Z0-9]+)(?:[\/?]|$)'), 'type': 'Newegg Product ID'},
    {'site': 'temu.com', 'regex': re.compile(r'/-g-(\d+)(?:\.html|[\/?]|$)/'), 'type': 'Temu Product ID'},
    {'site': 'rakuten.com', 'regex': re.compile(r'/\/product\/(\d+)(\/|\?|$)/'), 'type': 'Rakuten Product ID'},
    {'site': 'flipkart.com', 'regex': re.compile(r'\?pid=([A-Z0-9]+)'), 'type': 'Flipkart Product ID'},
    {'site': 'etsy.com', 'regex': re.compile(r'/listing\/(\d+)(\/|\?|$)/'), 'type': 'Etsy Listing ID'},
    {'site': 'homedepot.com', 'regex': re.compile(r'/\/p\/(?:[^\/]+\/)?(\d+)(?:[\/?]|$)/'), 'type': 'Home Depot Product ID'},
    {'site': 'lowes.com', 'regex': re.compile(r'(\d+)(\/|\?|$)'), 'type': 'Lowes Product ID'},
    {'site': 'wayfair.com', 'regex': re.compile(r'.*\/pdp\/.*-([a-zA-Z0-9]+)\.html$'), 'type': 'Wayfair Product ID'},
    {'site': 'overstock.com', 'regex': re.compile(r'product\.html\?productId=(\d+)(\/|\?|$)'), 'type': 'Overstock Product ID'},
    {'site': 'bhphotovideo.com', 'regex': re.compile(r'/(\d{7}-[A-Z0-9]+)\//'), 'type': 'B&H Photo Product ID'},
    {'site': 'neimanmarcus.com', 'regex': re.compile(r'/\/p\/([^\/?#&]+)/'), 'type': 'Neiman Marcus Product ID'},
    {'site': 'macys.com', 'regex': re.compile(r'[?&]ID=(\d+)'), 'type': 'Macy\'s Product ID'},
    {'site': 'kohls.com', 'regex': re.compile(r'/\/prd-(\d+)(?!\/|\?)/'), 'type': 'Kohl\'s Product ID'},
]

# Define regular expressions for universal product identifiers (EAN, GTIN, UPC)
id_patterns = {
    'EAN-13': re.compile(r'\b\d{13}\b'),    # EAN-13: exactly 13 digits
    'GTIN': re.compile(r'\b\d{14}\b'),      # GTIN: exactly 14 digits
    'UPC': re.compile(r'\b\d{12}\b'),       # UPC: exactly 12 digits
    'EAN-8': re.compile(r'\b\d{8}\b'),      # EAN-8: exactly 8 digits
}

# Function to search for product IDs
def find_product_ids(content, patterns, id_patterns):
    found_product_ids = []
    
    # Loop over each e-commerce platform pattern and search in URL content
    for pattern in patterns:
        matches = pattern['regex'].findall(content)
        for match in matches:
            found_product_ids.append(f'{pattern["type"]}: {match}')

    # Loop over each ID pattern and search in the visible text content
    for id_type, regex in id_patterns.items():
        matches = regex.findall(content)
        for match in matches:
            found_product_ids.append(f'{id_type}: {match}')
    
    # Generic pattern for any sequence of 4 or more digits (e.g., SKU)
    generic_regex = re.compile(r'\b\d{4,}\b')
    generic_matches = generic_regex.findall(content)
    for match in generic_matches:
        found_product_ids.append(f'Generic Product ID: {match}')
    
    return found_product_ids

# Combine URL and visible page content (if URL scraping is relevant)
full_content = url + "\n" + page_content

# Find product IDs in the visible text and URL
product_ids = find_product_ids(full_content, site_patterns, id_patterns)

# Print the found product IDs
if product_ids:
    print("Found Product Identifiers:")
    for pid in product_ids:
        print(pid)
else:
    print("No product identifiers found.")

Found Product Identifiers:
Generic Product ID: 6474683
Generic Product ID: 6474683
