In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to get product data from the page
def get_product_data(soup):
    # Initialize empty variables for each field
    product_data = {
        "title": None,
        "description": None,
        "price": None,
        "image_url": None,
        "features": [],
        "specifications": []
    }

    # Extract product title
    title_tag = soup.find('h1')
    if title_tag:
        product_data['title'] = title_tag.text.strip()

    # Extract description
    description_tag = soup.find('p', property='description')
    if description_tag:
        product_data['description'] = description_tag.text.strip()

    # Extract image URL
    image_tag = soup.find('meta', property='og:image')
    if image_tag:
        product_data['image_url'] = image_tag['content']

    # Extract features
    features_section = soup.find('div', id='featurebenefits')
    if features_section:
        feature_items = features_section.find_all('div', class_='row')
        for item in feature_items:
            feature = item.get_text(separator=' ', strip=True)
            if feature:
                product_data['features'].append(feature)

    # Extract specifications
    specs_section = soup.find('div', id='specifications')
    if specs_section:
        spec_rows = specs_section.find_all('tr')
        for row in spec_rows:
            spec_title = row.find('td')
            spec_value = spec_title.find_next_sibling('td')
            if spec_title and spec_value:
                product_data['specifications'].append(f"{spec_title.text.strip()}: {spec_value.text.strip()}")

    return product_data

# Function to save data into a CSV file
def save_to_csv(data, filename):
    df = pd.DataFrame([data])
    df.to_csv(filename, index=False)

# Main function to run the crawler
def main():
    url = 'https://www.kaercher.com/us/home-garden/electric-pressure-washers.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    }

    # Send a request to fetch the HTML content of the page
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')

    # Get the product data
    product_data = get_product_data(soup)

    # Save the data to a CSV file
    save_to_csv(product_data, 'kaercher_product.csv')

    print(f"Data saved to kaercher_product.csv.")

if __name__ == "__main__":
    main()


Data saved to kaercher_product.csv.


In [6]:
import requests
from bs4 import BeautifulSoup

# URL of the page to scrape
url = 'https://www.kaercher.com/us/home-garden/electric-pressure-washers.html'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find all product categories by targeting the relevant class
categories = soup.find_all('div', class_='fc-image')

# Loop through each category and extract data
for category in categories:
    # Find the image link
    img_tag = category.find('img')
    img_url = img_tag['data-src'] if img_tag else 'No Image'
    
    # Find the product link
    a_tag = category.find('a')
    product_url = a_tag['href'] if a_tag else 'No URL'
    
    # Find the product title
    title = a_tag.text if a_tag else 'No Title'
    
    # Output the scraped data
    print(f"Product Title: {title}")
    print(f"Product URL: {product_url}")
    print(f"Image URL: {img_url}")
    print("-" * 30)



In [9]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# Function to extract product data from the JSON embedded in the script tag
def extract_product_data(json_data):
    products = []
    clusters = json_data.get('clusters', [])
    
    for cluster in clusters:
        for product in cluster['products']:
            name = product.get('name', '')
            partnumber = product.get('partnumber', '')
            description = product.get('description', '')
            image_url = product.get('image', '')
            product_url = "https://www.kaercher.com" + product.get('url', '')
            # Append the data to the list
            products.append([name, partnumber, description, image_url, product_url])
    
    return products

# Function to scrape the page and extract product information
def scrape_products(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Find the <script> tag containing the JSON data
    script_tag = soup.find('script', {'type': 'application/json', 'data-kjs-setting': '_gist'})
    
    if script_tag:
        json_data = json.loads(script_tag.string)
        # Extract product data from JSON
        products = extract_product_data(json_data)
        return products
    else:
        print("No product data found in the script tag.")
        return []

# Function to save product data into a CSV file
def save_to_csv(products, filename='products.csv'):
    df = pd.DataFrame(products, columns=['Name', 'Part Number', 'Description', 'Image URL', 'Product URL'])
    df.to_csv(filename, index=False)

# Main function to run the scraper
def main():
    url = 'https://www.kaercher.com/us/home-garden/electric-pressure-washers.html'  # The target URL
    products = scrape_products(url)
    
    if products:
        save_to_csv(products)
        print(f"Data saved to products.csv. {len(products)} products found.")
    else:
        print("No products found.")

if __name__ == "__main__":
    main()


Data saved to products.csv. 20 products found.


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape product details and all images from a single product page
def scrape_product_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract product name and description
    try:
        name = soup.find('h1').text.strip()
    except AttributeError:
        name = "Name not found"

    try:
        description = soup.find('meta', {'name': 'description'})['content']
    except AttributeError:
        description = "Description not found"

    try:
        price = soup.find('span', class_='price').text.strip()
    except AttributeError:
        price = "Price not found"
    
    # Extract image URLs
    main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
    
    # Find all images in feature benefits section
    feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

    # Get additional images (e.g., in specifications)
    additional_images = [img['data-src'] for img in soup.select('div#specifications img')]

    # Combine all images
    all_images = [main_image_url] + feature_images + additional_images

    # Return a structured dictionary
    return {
        'Product URL': url,
        'Name': name,
        'Description': description,
        'Price': price,
        'Main Image URL': main_image_url,
        'Feature Images': ', '.join(feature_images),  # Join list into a single string
        'Additional Images': ', '.join(additional_images)
    }

# Function to scrape multiple product URLs and store the data
def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        products_data.append(product_data)
    return products_data

# Function to save product data to a CSV file
def save_to_csv(products, filename='products_with_images.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)

# Main function to run the scraper
def main():
    # List of product URLs
    product_urls = [
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-5-premium-smart-control-13246830.html",
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-5-premium-smart-control-car-home-13246840.html",
        # Add more URLs as needed
    ]

    # Scrape product data from all URLs
    products = scrape_products(product_urls)
    
    # Save the data to CSV
    save_to_csv(products)
    print(f"Data saved to products_with_images.csv. {len(products)} products found.")

if __name__ == "__main__":
    main()



Data saved to products_with_images.csv. 2 products found.


In [13]:
#Downlad Products data with 
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape product details and all tabbed sections from a single product page
def scrape_product_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')

    # Extract product name and description
    try:
        name = soup.find('h1').text.strip()
    except AttributeError:
        name = "Name not found"

    try:
        description = soup.find('meta', {'name': 'description'})['content']
    except AttributeError:
        description = "Description not found"

    try:
        scripts = soup.find_all('script', type='application/ld+json')
        for script in scripts:
            json_data = json.loads(script.string)
            if 'offers' in json_data:
            price = json_data['offers']['price']
        break
    else:
        price = "Price not found"
except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
    price = "Price not found"
    
    # Extract image URLs
    main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
    
    # Find all images in feature benefits section
    feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

    # Extract tabbed section content
    try:
        description_section = soup.select_one('div#description').text.strip()
    except AttributeError:
        description_section = "Description section not found"
    
    try:
        features_section = soup.select_one('div#featurebenefits').text.strip()
    except AttributeError:
        features_section = "Features section not found"
    
    try:
        specifications_section = soup.select_one('div#specifications').text.strip()
    except AttributeError:
        specifications_section = "Specifications section not found"
    
    try:
        documents_section = soup.select_one('div#downloads').text.strip()
    except AttributeError:
        documents_section = "Documents section not found"
    
    try:
        videos_section = soup.select_one('div#videos').text.strip()
    except AttributeError:
        videos_section = "Videos section not found"
    
    try:
        application_section = soup.select_one('div#application').text.strip()
    except AttributeError:
        application_section = "Application section not found"
    
    try:
        accessories_section = soup.select_one('div#accessory').text.strip()
    except AttributeError:
        accessories_section = "Accessories section not found"
    
    try:
        detergents_section = soup.select_one('div#detergent').text.strip()
    except AttributeError:
        detergents_section = "Detergents section not found"
    
    try:
        parts_section = soup.select_one('div#spareparts').text.strip()
    except AttributeError:
        parts_section = "Parts section not found"
    
    try:
        ratings_section = soup.select_one('div#ratings').text.strip()
    except AttributeError:
        ratings_section = "Ratings section not found"

    # Return a structured dictionary
    return {
        'Product URL': url,
        'Name': name,
        'Description': description,
        'Price': price,
        'Main Image URL': main_image_url,
        'Feature Images': ', '.join(feature_images),  # Join list into a single string
        'Description Section': description_section,
        'Features Section': features_section,
        'Specifications Section': specifications_section,
        'Documents Section': documents_section,
        'Videos Section': videos_section,
        'Application Section': application_section,
        'Accessories Section': accessories_section,
        'Detergents Section': detergents_section,
        'Parts Section': parts_section,
        'Ratings Section': ratings_section
    }

# Function to scrape multiple product URLs and store the data
def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        products_data.append(product_data)
    return products_data

# Function to save product data to a CSV file
def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)

# Main function to run the scraper
def main():
    # List of product URLs
    product_urls = [
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-4-power-control-13240450.html",
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-3-power-control-16761090.html",
        # Add more URLs as needed
    ]

    # Scrape product data from all URLs
    products = scrape_products(product_urls)
    
    # Save the data to CSV
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")

if __name__ == "__main__":
    main()


IndentationError: expected an indented block (3754435580.py, line 27)

In [15]:
import requests

# URL of the sitemap
url = 'https://www.kaercher.com/us/sitemap1.xml'

# Send an HTTP GET request to download the file
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Save the file locally
    with open('sitemap1.xml', 'wb') as file:
        file.write(response.content)
    print("Sitemap downloaded successfully!")
else:
    print(f"Failed to download the sitemap. Status code: {response.status_code}")


Sitemap downloaded successfully!


In [18]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Step 1: Fetch the sitemap XML
sitemap_url = 'https://www.kaercher.com/us/sitemap1.xml'
response = requests.get(sitemap_url)

# Check if the request was successful
if response.status_code == 200:
    sitemap_xml = response.content

    # Step 2: Parse the XML data
    root = ET.fromstring(sitemap_xml)

    # Initialize a list to hold product information
    product_data = []

    # Step 3: Extract product URLs or relevant data
    # The sitemap XML is typically structured with <url> and <loc> tags
    for url in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
        loc = url.find('{http://www.sitemaps.org/schemas/sitemap/0.9}loc').text
        product_data.append({'Product URL': loc})

    # Step 4: Convert to a DataFrame and export as CSV
    df = pd.DataFrame(product_data)

    # Exporting DataFrame to CSV
    output_file = 'kaercher_products.csv'  # Modify this path if needed
    df.to_csv(output_file, index=False)

    # Display the first few rows of the DataFrame to verify
    import ace_tools as tools; tools.display_dataframe_to_user(name="Karcher Products", dataframe=df)

    print(f"Product data has been exported to {output_file}.")
else:
    print(f"Failed to fetch the sitemap. Status code: {response.status_code}")


ModuleNotFoundError: No module named 'ace_tools'

In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_product_page(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Check for request issues
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract product name and description
        try:
            name = soup.find('h1').text.strip()
        except AttributeError:
            name = "Name not found"

        try:
            description = soup.find('meta', {'name': 'description'})['content']
        except AttributeError:
            description = "Description not found"

        # Extract price from JSON-LD script
        try:
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                json_data = json.loads(script.string)
                if 'offers' in json_data:
                    price = json_data['offers']['price']
                    break
            else:
                price = "Price not found"
        except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
            price = "Price not found"

        # Extract image URLs
        main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
        
        # Find all images in feature benefits section
        feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

        # Extract tabbed section content (similar logic for other sections)
        try:
            description_section = soup.select_one('div#description').text.strip()
        except AttributeError:
            description_section = "Description section not found"
        
        try:
            features_section = soup.select_one('div#featurebenefits').text.strip()
        except AttributeError:
            features_section = "Features section not found"
        
        try:
            specifications_section = soup.select_one('div#specifications').text.strip()
        except AttributeError:
            specifications_section = "Specifications section not found"
        
        try:
            documents_section = soup.select_one('div#downloads').text.strip()
        except AttributeError:
            documents_section = "Documents section not found"
        
        try:
            videos_section = soup.select_one('div#videos').text.strip()
        except AttributeError:
            videos_section = "Videos section not found"
        
        try:
            application_section = soup.select_one('div#application').text.strip()
        except AttributeError:
            application_section = "Application section not found"
        
        try:
            accessories_section = soup.select_one('div#accessory').text.strip()
        except AttributeError:
            accessories_section = "Accessories section not found"
        
        try:
            detergents_section = soup.select_one('div#detergent').text.strip()
        except AttributeError:
            detergents_section = "Detergents section not found"
        
        try:
            parts_section = soup.select_one('div#spareparts').text.strip()
        except AttributeError:
            parts_section = "Parts section not found"
        
        try:
            ratings_section = soup.select_one('div#ratings').text.strip()
        except AttributeError:
            ratings_section = "Ratings section not found"

        return {
            'Product URL': url,
            'Name': name,
            'Description': description,
            'Price': price,
            'Main Image URL': main_image_url,
            'Feature Images': ', '.join(feature_images),
            'Description Section': description_section,
            'Features Section': features_section,
            'Specifications Section': specifications_section,
            'Documents Section': documents_section,
            'Videos Section': videos_section,
            'Application Section': application_section,
            'Accessories Section': accessories_section,
            'Detergents Section': detergents_section,
            'Parts Section': parts_section,
            'Ratings Section': ratings_section
        }

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        if product_data:
            products_data.append(product_data)
    return products_data


def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)


def main():
    product_urls = [
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-4-power-control-13240450.html",
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-3-power-control-16761090.html"
    ]

    products = scrape_products(product_urls)
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")


if __name__ == "__main__":
    main()


Data saved to products_with_tabs.csv. 2 products found.


In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_product_page(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract product name and description
        try:
            name = soup.find('h1').text.strip()
        except AttributeError:
            name = "Name not found"

        try:
            description = soup.find('meta', {'name': 'description'})['content']
        except AttributeError:
            description = "Description not found"

        # Extract price from JSON-LD script
        try:
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                json_data = json.loads(script.string)
                if 'offers' in json_data:
                    price = json_data['offers']['price']
                    break
            else:
                price = "Price not found"
        except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
            price = "Price not found"

        # Extract image URLs
        main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
        
        # Find all images in feature benefits section
        feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

        # Extract specifications
        specifications = {}
        try:
            spec_table_rows = soup.select('div#specifications table tr')
            for row in spec_table_rows:
                cells = row.find_all('td')
                if len(cells) == 2:
                    spec_key = cells[0].text.strip()
                    spec_value = cells[1].text.strip()
                    specifications[spec_key] = spec_value
        except AttributeError:
            pass

        # Extract tabbed section content (same logic for other sections)
        try:
            description_section = soup.select_one('div#description').text.strip()
        except AttributeError:
            description_section = "Description section not found"
        
        try:
            features_section = soup.select_one('div#featurebenefits').text.strip()
        except AttributeError:
            features_section = "Features section not found"
        
        try:
            specifications_section = soup.select_one('div#specifications').text.strip()
        except AttributeError:
            specifications_section = "Specifications section not found"
        
        try:
            documents_section = soup.select_one('div#downloads').text.strip()
        except AttributeError:
            documents_section = "Documents section not found"
        
        try:
            videos_section = soup.select_one('div#videos').text.strip()
        except AttributeError:
            videos_section = "Videos section not found"
        
        try:
            application_section = soup.select_one('div#application').text.strip()
        except AttributeError:
            application_section = "Application section not found"
        
        try:
            accessories_section = soup.select_one('div#accessory').text.strip()
        except AttributeError:
            accessories_section = "Accessories section not found"
        
        try:
            detergents_section = soup.select_one('div#detergent').text.strip()
        except AttributeError:
            detergents_section = "Detergents section not found"
        
        try:
            parts_section = soup.select_one('div#spareparts').text.strip()
        except AttributeError:
            parts_section = "Parts section not found"
        
        try:
            ratings_section = soup.select_one('div#ratings').text.strip()
        except AttributeError:
            ratings_section = "Ratings section not found"

        # Create dictionary with product data
        product_data = {
            'Product URL': url,
            'Name': name,
            'Description': description,
            'Price': price,
            'Main Image URL': main_image_url,
            'Feature Images': ', '.join(feature_images),
            'Description Section': description_section,
            'Features Section': features_section,
            'Specifications Section': specifications_section,
            'Documents Section': documents_section,
            'Videos Section': videos_section,
            'Application Section': application_section,
            'Accessories Section': accessories_section,
            'Detergents Section': detergents_section,
            'Parts Section': parts_section,
            'Ratings Section': ratings_section
        }

        # Add specifications as separate columns
        product_data.update(specifications)

        return product_data

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        if product_data:
            products_data.append(product_data)
    return products_data


def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)


def main():
    product_urls = [
        "https://www.kaercher.com/us/accessories/suction-bar-carpet-cleaning-50332750.html",
        "https://www.kaercher.com/us/accessories/squeegee-47770800.html"
    ]

    products = scrape_products(product_urls)
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")


if __name__ == "__main__":
    main()


Data saved to products_with_tabs.csv. 2 products found.


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_product_page(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract product name and description
        try:
            name = soup.find('h1').text.strip()
        except AttributeError:
            name = "Name not found"

        try:
            description = soup.find('meta', {'name': 'description'})['content']
        except AttributeError:
            description = "Description not found"

        # Initialize price and part number with default values
        price = "Price not found"
        part_number = "Part number not found"

        # Look for the structured JSON-LD data
        try:
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                json_data = json.loads(script.string)
                
                # Look for price under the 'offers' key
                if 'offers' in json_data and 'price' in json_data['offers']:
                    price = json_data['offers']['price']
                
                # Look for part number in 'sku' or 'mpn'
                if 'sku' in json_data:
                    part_number = json_data['sku']
                elif 'mpn' in json_data:
                    part_number = json_data['mpn']
                
                # If both price and part_number are found, break
                if price != "Price not found" and part_number != "Part number not found":
                    break

        except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
            pass

        # Extract image URLs
        main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
        
        # Find all images in feature benefits section
        feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

        # Extract specifications
        specifications = {}
        try:
            spec_table_rows = soup.select('div#specifications table tr')
            for row in spec_table_rows:
                cells = row.find_all('td')
                if len(cells) == 2:
                    spec_key = cells[0].text.strip()
                    spec_value = cells[1].text.strip()
                    specifications[spec_key] = spec_value
        except AttributeError:
            pass

        # Extract tabbed section content (similar logic for other sections)
        try:
            description_section = soup.select_one('div#description').text.strip()
        except AttributeError:
            description_section = "Description section not found"
        
        try:
            features_section = soup.select_one('div#featurebenefits').text.strip()
        except AttributeError:
            features_section = "Features section not found"
        
        try:
            specifications_section = soup.select_one('div#specifications').text.strip()
        except AttributeError:
            specifications_section = "Specifications section not found"
        
        try:
            documents_section = soup.select_one('div#downloads').text.strip()
        except AttributeError:
            documents_section = "Documents section not found"
        
        try:
            videos_section = soup.select_one('div#videos').text.strip()
        except AttributeError:
            videos_section = "Videos section not found"
        
        try:
            application_section = soup.select_one('div#application').text.strip()
        except AttributeError:
            application_section = "Application section not found"
        
        try:
            accessories_section = soup.select_one('div#accessory').text.strip()
        except AttributeError:
            accessories_section = "Accessories section not found"
        
        try:
            detergents_section = soup.select_one('div#detergent').text.strip()
        except AttributeError:
            detergents_section = "Detergents section not found"
        
        try:
            parts_section = soup.select_one('div#spareparts').text.strip()
        except AttributeError:
            parts_section = "Parts section not found"
        
        try:
            ratings_section = soup.select_one('div#ratings').text.strip()
        except AttributeError:
            ratings_section = "Ratings section not found"

        # Create dictionary with product data
        product_data = {
            'Product URL': url,
            'Name': name,
            'Part Number': part_number,  # Adding Part Number
            'Description': description,
            'Price': price,
            'Main Image URL': main_image_url,
            'Feature Images': ', '.join(feature_images),
            'Description Section': description_section,
            'Features Section': features_section,
            'Specifications Section': specifications_section,
            'Documents Section': documents_section,
            'Videos Section': videos_section,
            'Application Section': application_section,
            'Accessories Section': accessories_section,
            'Detergents Section': detergents_section,
            'Parts Section': parts_section,
            'Ratings Section': ratings_section
        }

        # Add specifications as separate columns
        product_data.update(specifications)

        return product_data

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        if product_data:
            products_data.append(product_data)
    return products_data


def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)


def main():
    product_urls = [
        "https://www.kaercher.com/us/home-garden/window-vac.html",
        "https://www.kaercher.com/us/home-garden/window-vac/wv-6-plus-white-16337450.html"
    ]

    products = scrape_products(product_urls)
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")


if __name__ == "__main__":
    main()


Data saved to products_with_tabs.csv. 2 products found.


In [74]:
#Product download without catalog
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_product_page(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        # Extract product name and description
        try:
            name = soup.find('h1').text.strip()
        except AttributeError:
            name = "Name not found"

        try:
            description = soup.find('meta', {'name': 'description'})['content']
        except AttributeError:
            description = "Description not found"

        # Initialize price and part number with default values
        price = "Price not found"
        part_number = "Part number not found"

        # Look for the structured JSON-LD data
        try:
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                json_data = json.loads(script.string)
                
                # Look for price under the 'offers' key
                if 'offers' in json_data and 'price' in json_data['offers']:
                    price = json_data['offers']['price']
                
                # Look for part number in 'sku' or 'mpn'
                if 'sku' in json_data:
                    part_number = json_data['sku']
                elif 'mpn' in json_data:
                    part_number = json_data['mpn']
                
                # If both price and part_number are found, break
                if price != "Price not found" and part_number != "Part number not found":
                    break

        except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
            pass

        # Extract image URLs
        main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
        
        # Find all images in feature benefits section
        feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

        # Extract specifications
        specifications = {}
        try:
            spec_table_rows = soup.select('div#specifications table tr')
            for row in spec_table_rows:
                cells = row.find_all('td')
                if len(cells) == 2:
                    spec_key = cells[0].text.strip()
                    spec_value = cells[1].text.strip()
                    specifications[spec_key] = spec_value
        except AttributeError:
            pass

        # Create dictionary with product data
        product_data = {
            'Product URL': url,
            'Name': name,
            'Part Number': part_number,  # Adding Part Number
            'Description': description,
            'Price': price,
            'Main Image URL': main_image_url,
            'Feature Images': ', '.join(feature_images),
                  }

        # Add specifications as separate columns
        product_data.update(specifications)

        return product_data

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        if product_data:
            products_data.append(product_data)
    return products_data


def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)


def main():
    product_urls = [
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-4-power-control-13240450.html",
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-3-power-control-16761090.html"
    ]

    products = scrape_products(product_urls)
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")


if __name__ == "__main__":
    main()


Data saved to products_with_tabs.csv. 2 products found.


In [72]:
#Product Download with pdf catalogs
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import json

def scrape_product_page(url):
    try:
        # Setup Selenium with Chrome driver
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run Chrome in headless mode
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        service = Service('C:\\Users\\amir.emami\\AppData\\Local\\Temp\\Rar$EXa36532.5574.rartemp\\chromedriver-win32\\chromedriver.exe')

        driver = webdriver.Chrome(service=service, options=chrome_options)
        
        driver.get(url)
        
        # Introduce delay to simulate human-like page load and interaction
        time.sleep(5)  # Wait for 5 seconds, simulating human wait for the page to load

        # Get the page source and process it with BeautifulSoup
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'lxml')

        # Close the browser session
        driver.quit()

        # Extract product name and description
        try:
            name = soup.find('h1').text.strip()
        except AttributeError:
            name = "Name not found"

        try:
            description = soup.find('meta', {'name': 'description'})['content']
        except AttributeError:
            description = "Description not found"

        # Initialize price and part number with default values
        price = "Price not found"
        part_number = "Part number not found"

        # Look for the structured JSON-LD data
        try:
            scripts = soup.find_all('script', type='application/ld+json')
            for script in scripts:
                json_data = json.loads(script.string)
                
                # Look for price under the 'offers' key
                if 'offers' in json_data and 'price' in json_data['offers']:
                    price = json_data['offers']['price']
                
                # Look for part number in 'sku' or 'mpn'
                if 'sku' in json_data:
                    part_number = json_data['sku']
                elif 'mpn' in json_data:
                    part_number = json_data['mpn']
                
                # If both price and part_number are found, break
                if price != "Price not found" and part_number != "Part number not found":
                    break

        except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
            pass

        # Extract image URLs
        main_image_url = soup.find('meta', {'property': 'og:image'})['content'] if soup.find('meta', {'property': 'og:image'}) else "Main image not found"
        
        # Find all images in feature benefits section
        feature_images = [img['data-src'] for img in soup.select('div#featurebenefits img')]

        # Extract documents (Product Brochure, Operating Instructions)
        product_brochure_url = "Product Brochure not found"
        operating_instruction_url_1 = "Operating instructions not found"
        operating_instruction_url_2 = "Operating instructions not found"

        try:
            documents_section = soup.select('div.fc-document')
            for document in documents_section:
                doc_title = document.select_one('div.fc-title h6').text.strip()
                doc_link = document.select_one('a.trk-download')['href']

                if "Product Brochure" in doc_title:
                    product_brochure_url = doc_link
                elif "Operating instructions" in doc_title and ".pdf" in doc_link:
                    operating_instruction_url_1 = doc_link
                elif "Operating instructions" in doc_title and ".html" in doc_link:
                    operating_instruction_url_2 = doc_link

        except AttributeError:
            pass

        # Extract specifications
        specifications = {}
        try:
            spec_table_rows = soup.select('div#specifications table tr')
            for row in spec_table_rows:
                cells = row.find_all('td')
                if len(cells) == 2:
                    spec_key = cells[0].text.strip()
                    spec_value = cells[1].text.strip()
                    specifications[spec_key] = spec_value
        except AttributeError:
            pass

        # Create dictionary with product data
        product_data = {
            'Product URL': url,
            'Name': name,
            'Part Number': part_number,  # Adding Part Number
            'Description': description,
            'Price': price,
            'Main Image URL': main_image_url,
            'Feature Images': ', '.join(feature_images),
            'Product Brochure URL': product_brochure_url,
            'Operating Instructions PDF URL': operating_instruction_url_1,
            'Operating Instructions HTML URL': operating_instruction_url_2
        }

        # Add specifications as separate columns
        product_data.update(specifications)

        return product_data

    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def scrape_products(urls):
    products_data = []
    for url in urls:
        product_data = scrape_product_page(url)
        if product_data:
            products_data.append(product_data)
    return products_data

def save_to_csv(products, filename='products_with_tabs.csv'):
    df = pd.DataFrame(products)
    df.to_csv(filename, index=False)

def main():
    product_urls = [
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-4-power-control-13240450.html",
        "https://www.kaercher.com/us/home-garden/electric-pressure-washers/k-3-power-control-16761090.html"
    ]

    products = scrape_products(product_urls)
    save_to_csv(products)
    print(f"Data saved to products_with_tabs.csv. {len(products)} products found.")

if __name__ == "__main__":
    main()


Data saved to products_with_tabs.csv. 2 products found.
