In [2]:
from bs4 import BeautifulSoup
import csv
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def scrape_with_requests_first(url):
    """Fetch *url* with a plain HTTP GET before resorting to a browser.

    A desktop-browser User-Agent is sent and the request is limited to a
    10 second timeout.  Non-2xx responses are treated as failures.

    Returns the response body as text on success.  On any error the reason
    is printed and ``None`` is returned so the caller can fall back to
    Selenium.
    """
    browser_like_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        reply = requests.get(url, headers=browser_like_headers, timeout=10)
        # Turn HTTP error statuses into exceptions so they take the same
        # failure path as connection problems.
        reply.raise_for_status()
        return reply.text
    except Exception as err:
        print(f"Requests failed: {err}. Trying Selenium...")
        return None

def scrape_with_selenium(url):
    """Render *url* in headless Chrome and return the page's HTML source.

    The driver binary is obtained through ``ChromeDriverManager``.  Page
    loading is capped at 30 seconds, and the function waits up to 10 seconds
    for a ``<table>`` element to appear before reading the source.

    Returns the HTML as a string, or ``None`` (after printing the error) if
    any step fails.  The browser is always shut down before returning.
    """
    browser_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    )
    chrome_options = Options()
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    # Kept outside the try so the finally clause can tell whether a browser
    # was actually started.
    driver = None
    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
        driver.set_page_load_timeout(30)

        print("Loading page with Selenium...")
        driver.get(url)

        # Block until at least one table exists in the DOM.
        waiter = WebDriverWait(driver, 10)
        waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table')))

        # Fixed extra pause after the table appears; presumably gives
        # client-side scripts time to finish filling in the page.
        time.sleep(3)
        return driver.page_source
    except Exception as err:
        print(f"Selenium error: {err}")
        return None
    finally:
        if driver:
            driver.quit()

def debug_table_structure(html_content):
    """Print a diagnostic overview of where specifications might live.

    Parses *html_content* and reports, on stdout:

    * every ``<table>``: its classes, id, caption (if any), row count, and a
      preview of the first 5 rows / first 3 cells (cell text cut at 50 chars);
    * ``<div>``/``<section>`` elements whose class mentions a
      specification-like keyword (first 3 listed);
    * ``<dl>`` definition lists with their term/definition counts (first 2).

    Nothing is returned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    print("=== DEBUGGING TABLE STRUCTURE ===\n")

    # --- Tables -----------------------------------------------------------
    tables = soup.find_all('table')
    print(f"Found {len(tables)} tables on the page")

    for table_no, table in enumerate(tables, start=1):
        print(f"\n--- TABLE {table_no} ---")
        print(f"Classes: {table.get('class', 'None')}")
        print(f"ID: {table.get('id', 'None')}")

        caption = table.find('caption')
        if caption:
            print(f"Caption: {caption.get_text(strip=True)}")

        rows = table.find_all('tr')
        row_total = len(rows)
        print(f"Number of rows: {row_total}")

        # Preview only the leading rows and cells to keep the output short.
        for row_no, row in enumerate(rows[:5], start=1):
            cells = row.find_all(['td', 'th'])
            print(f"  Row {row_no}: {len(cells)} columns")
            for col_no, cell in enumerate(cells[:3], start=1):
                preview = cell.get_text(strip=True)[:50]
                print(f"    Col {col_no}: '{preview}'")

        rows_not_shown = row_total - 5
        if rows_not_shown > 0:
            print(f"  ... and {rows_not_shown} more rows")

    # --- Other likely containers -------------------------------------------
    print("\n=== LOOKING FOR SPECIFICATION PATTERNS ===")

    spec_keywords = ('spec', 'detail', 'feature', 'characteristic')

    def has_spec_keyword(class_value):
        # Elements without a class attribute never match.
        if not class_value:
            return False
        lowered = str(class_value).lower()
        return any(word in lowered for word in spec_keywords)

    spec_containers = soup.find_all(['div', 'section'], class_=has_spec_keyword)
    if spec_containers:
        print(f"Found {len(spec_containers)} potential specification containers")
        for container_no, container in enumerate(spec_containers[:3], start=1):
            print(f"Container {container_no}: {container.get('class')}")

    # --- Definition lists (dl / dt / dd) -----------------------------------
    dl_elements = soup.find_all('dl')
    if dl_elements:
        print(f"Found {len(dl_elements)} definition lists")
        for dl_no, dl in enumerate(dl_elements[:2], start=1):
            term_count = len(dl.find_all('dt'))
            definition_count = len(dl.find_all('dd'))
            print(f"  DL {dl_no}: {term_count} terms, {definition_count} definitions")

def extract_specifications_enhanced(html_content):
    """Collect label/value pairs from *html_content* using three strategies.

    1. Tables: the first two cells of every row, when the label is longer
       than 2 characters and the value is non-empty.
    2. Definition lists: each ``<dt>`` paired positionally with a ``<dd>``.
    3. Class-tagged elements: ``div``/``span``/``p`` elements whose class
       contains a keyword and whose text has the form ``label: value``.

    Returns a list of ``(source, label, value)`` tuples in discovery order,
    where *source* names the strategy that produced the pair.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    found = []

    # Strategy 1: table rows with at least two cells.
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            cells = row.find_all(['td', 'th'])
            if len(cells) < 2:
                continue
            label = cells[0].get_text(separator=' ', strip=True)
            value = cells[1].get_text(separator=' ', strip=True)
            # Very short labels (1-2 chars) are skipped along with empty values.
            if len(label) > 2 and value:
                found.append(('Table', label, value))

    # Strategy 2: definition lists, pairing terms with definitions by position.
    for dl in soup.find_all('dl'):
        terms = dl.find_all('dt')
        definitions = dl.find_all('dd')
        for term, definition in zip(terms, definitions):
            label = term.get_text(strip=True)
            value = definition.get_text(strip=True)
            if label and value:
                found.append(('Definition List', label, value))

    # Strategy 3: "label: value" text inside elements with a telling class.
    spec_patterns = (
        ('div', 'spec'),
        ('div', 'feature'),
        ('div', 'detail'),
        ('span', 'spec'),
        ('p', 'spec'),
    )
    for tag, class_keyword in spec_patterns:
        source = f'{tag.upper()} ({class_keyword})'
        matches = soup.find_all(
            tag, class_=lambda x: x and class_keyword in str(x).lower()
        )
        for elem in matches:
            # Split on the first colon only; later colons stay in the value.
            raw_label, colon, raw_value = elem.get_text(strip=True).partition(':')
            if not colon:
                continue
            label = raw_label.strip()
            value = raw_value.strip()
            if label and value:
                found.append((source, label, value))

    return found

def main():
    """Scrape the product page, print diagnostics and save what was found.

    The page is fetched with requests, falling back to Selenium when that
    yields nothing.  The page structure is then dumped to stdout and the
    extraction strategies are run.  Extracted pairs are written to
    ``debug_specifications.csv``; if nothing is extracted, the raw HTML is
    written to ``debug_page.html`` for manual inspection instead.
    """
    url = "https://www.hbkworld.com/en/products/instruments/handheld/sound-level-meters/type-2245/2245-exhaust-noise"

    print("Starting web scraping...")

    # Cheap HTTP fetch first; the browser is only started if that fails.
    html_content = scrape_with_requests_first(url) or scrape_with_selenium(url)
    if not html_content:
        print("Failed to retrieve page content")
        return

    debug_table_structure(html_content)

    print("\n=== TRYING ENHANCED EXTRACTION ===")
    data = extract_specifications_enhanced(html_content)

    if not data:
        print("No specifications found with any method")

        # Keep the HTML around so the page can be examined by hand.
        with open('debug_page.html', 'w', encoding='utf-8') as html_file:
            html_file.write(html_content)
        print("Saved page HTML as 'debug_page.html' for manual inspection")
        return

    print(f"\nFound {len(data)} potential specifications:")
    for source, label, value in data[:10]:  # console preview is capped at 10
        print(f"{source}: {label} -> {value}")

    filename = "debug_specifications.csv"
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Source', 'Specification', 'Value'])
        writer.writerows(data)

    print(f"\nSaved all {len(data)} specifications to {filename}")

# Entry point: run the scraper only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting web scraping...
=== DEBUGGING TABLE STRUCTURE ===

Found 5 tables on the page

--- TABLE 1 ---
Classes: ['table', 'datatable-table']
ID: None
Number of rows: 4
  Row 1: 4 columns
    Col 1: ''
    Col 2: ''
    Col 3: ''
  Row 2: 4 columns
    Col 1: ''
    Col 2: ''
    Col 3: ''
  Row 3: 4 columns
    Col 1: ''
    Col 2: ''
    Col 3: ''
  Row 4: 4 columns
    Col 1: ''
    Col 2: ''
    Col 3: ''

--- TABLE 2 ---
Classes: ['all-models-table', 'hidden']
ID: models-all-models-35998bb201-table
Number of rows: 4
  Row 1: 5 columns
    Col 1: 'Code'
    Col 2: 'Actions'
    Col 3: 'Price'
  Row 2: 5 columns
    Col 1: '-2245-X-I-'
    Col 2: 'Not sellable online'
    Col 3: ''
  Row 3: 5 columns
    Col 1: '-2245-X-L-'
    Col 2: 'Not sellable online'
    Col 3: ''
  Row 4: 5 columns
    Col 1: '-2245-X-LC-'
    Col 2: 'Not sellable online'
    Col 3: ''

--- TABLE 3 ---
Classes: ['catego-filelist']
ID: None
Number of rows: 3
  Row 1: 3 columns
    Col 1: 'B&K 2245 Sound Level 