<a href="https://colab.research.google.com/github/nik-popov/coach-parser-notebook/blob/main/coach_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
# Compiled once so the per-tile loop does not rebuild them.
_TILE_CLASS_RE = re.compile('product-tile.*')
_SALE_PRICE_CLASS_RE = re.compile('salesPrice.*')
_FRP_RE = re.compile(r'frp=([^+]+)\+([^&]+)')
_LEADING_PRICE_RE = re.compile(r'\D*\d[\d,]*(?:\.\d+)?')
_NON_NUMERIC_RE = re.compile(r'[^\d.]')


def _leading_price(text):
    """Trim a price string down to its first numeric amount.

    Anything before the first amount (e.g. a currency symbol) is kept; anything
    after it (e.g. accessibility text such as " - Sales Price") is dropped.
    Text without any digit is returned stripped but otherwise unchanged.
    """
    stripped = text.strip()
    match = _LEADING_PRICE_RE.match(stripped)
    return match.group(0) if match else stripped


def _discount_label(comparable_value, sale_price):
    """Return the discount as a whole-number percentage string, or ' ' if it cannot be computed."""
    if comparable_value == ' ' or sale_price == ' ':
        return ' '
    try:
        comp_val = float(_NON_NUMERIC_RE.sub('', comparable_value))
        sale_val = float(_NON_NUMERIC_RE.sub('', sale_price))
    except ValueError:
        # Price text without a parseable number: leave the discount blank.
        return ' '
    if comp_val <= 0:
        return ' '
    discount = ((comp_val - sale_val) / comp_val) * 100
    return f"{int(round(discount))}%"


def parse_product_tiles(html_content, base_url="https://www.coach.com"):
    """Parse HTML content to extract product information, including thumbnail URL.

    Every ``div`` whose class matches ``product-tile.*`` yields one dictionary
    with the keys ``name``, ``product_url``, ``item_id``, ``variation_id``,
    ``comparable_value``, ``sale_price``, ``discount_percentage``,
    ``additional_discount``, ``image_url`` and ``description``. A field that
    cannot be found is set to a single space (``' '``).

    Args:
        html_content (str): HTML content to parse.
        base_url (str): Base URL for resolving relative product and image URLs.

    Returns:
        list: List of dictionaries containing product details.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    product_tiles = soup.find_all('div', class_=_TILE_CLASS_RE)
    products = []

    for tile in product_tiles:
        product = {}

        # Extract product name
        name_elem = tile.find('p', class_='chakra-text css-as1hzn')
        product['name'] = name_elem.text.strip() if name_elem else ' '

        # Extract product URL
        link_elem = tile.find('a', class_='css-avqw6d')
        href = link_elem.get('href') if link_elem else None
        product['product_url'] = urljoin(base_url, href) if href else ' '

        # Extract item_id from class (e.g., product-tile-CV933 -> CV933)
        class_list = tile.get('class', [])
        item_id_match = next(
            (
                cls.split('product-tile-')[1]
                for cls in class_list
                if cls.startswith('product-tile-') and len(cls.split('-')) > 2
            ),
            None,
        )
        product['item_id'] = item_id_match if item_id_match else ' '

        # Extract variation_id from frp in URL (e.g., frp=CV933+IMXAQ -> IMXAQ)
        variation_id = ' '
        if href:
            frp_match = _FRP_RE.search(href)
            if frp_match:
                variation_id = frp_match.group(2)
        product['variation_id'] = variation_id

        # Extract comparable value (original price)
        comparable_elem = tile.find('span', {'data-qa': 'txt_comparable_value_price'})
        comparable_value = _leading_price(comparable_elem.text) if comparable_elem else ' '
        product['comparable_value'] = comparable_value

        # Extract sale price. The element's text can carry trailing
        # accessibility labels ("$149   - Sales Price    - Original Price"),
        # so only the leading amount is kept.
        sale_price_elem = tile.find('span', class_=_SALE_PRICE_CLASS_RE)
        sale_price = _leading_price(sale_price_elem.text) if sale_price_elem else ' '
        product['sale_price'] = sale_price

        # Calculate discount percentage if comparable and sale prices are available
        product['discount_percentage'] = _discount_label(comparable_value, sale_price)

        # Extract additional discount (not present in sample, keep for compatibility)
        additional_discount_elem = tile.find('span', class_='plpprimarytag wk47memorialday')
        product['additional_discount'] = (
            additional_discount_elem.text.strip() if additional_discount_elem else ' '
        )

        # Extract thumbnail URL (primary product image)
        thumbnail_div = tile.find('div', class_='product-thumbnail plpv3 css-1gasjii')
        img_tag = thumbnail_div.find('img', class_='chakra-image css-klg1v') if thumbnail_div else None
        if img_tag and img_tag.get('src'):
            img_url = img_tag['src']
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(base_url, img_url)
            product['image_url'] = img_url
        else:
            product['image_url'] = ' '

        # Extract description from the image alt text: whatever follows the
        # product name. Skipped when the name is blank, because a blank name
        # would either match any space in the alt text (placeholder ' ') or
        # make str.split raise on an empty separator ('').
        description = ' '
        known_name = product['name'].strip()
        if known_name and img_tag and img_tag.get('alt'):
            alt_text = img_tag['alt']
            if known_name in alt_text:
                description = alt_text.split(known_name, 1)[-1].strip(',').strip()
        product['description'] = description

        products.append(product)

    return products


def print_product_info(products):
    """Print a labelled summary of every product to stdout.

    Args:
        products (list): Product dictionaries. Each one must contain the keys
            ``name``, ``item_id``, ``variation_id``, ``product_url``,
            ``comparable_value``, ``sale_price``, ``discount_percentage``,
            ``additional_discount`` and ``description``.
    """
    # (label shown to the user, dictionary key) in output order.
    labelled_fields = (
        ("Product", 'name'),
        ("Item ID", 'item_id'),
        ("Variation ID", 'variation_id'),
        ("Product URL", 'product_url'),
        ("Comparable Value", 'comparable_value'),
        ("Sale Price", 'sale_price'),
        ("Discount", 'discount_percentage'),
        ("Additional Discount", 'additional_discount'),
        ("Description", 'description'),
    )
    separator = "-" * 50

    for entry in products:
        for label, key in labelled_fields:
            print(f"{label}: {entry[key]}")
        print(separator)
def load_html_from_txt(file_path):
    """Load HTML content from a UTF-8 text file.

    Failures to find, read or decode the file are reported on stdout and
    signalled with a ``None`` return value rather than an exception.

    Args:
        file_path (str): Path to the text file containing HTML content.

    Returns:
        str: HTML content as a string, or None if the file is missing,
        cannot be read, or is not valid UTF-8.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except (IOError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed explicitly for badly-encoded files to yield None as well.
        print(f"Error reading file: {e}")
    return None
import csv

def save_to_csv(products, csv_file_path):
    """Write the product dictionaries to a CSV file with a header row.

    Each CSV column is filled from the matching product key; a key that is
    absent from a product is written as a single space.

    Args:
        products (list): List of dictionaries containing product details.
        csv_file_path (str): Path to the output CSV file.

    Returns:
        bool: True if saving was successful, False if an IOError occurred.
    """
    # CSV column header -> key in the product dictionary, in column order.
    column_sources = {
        'Name': 'name',
        'Item_ID': 'item_id',
        'Variation_ID': 'variation_id',
        'Product_URL': 'product_url',
        'Comparable_Value': 'comparable_value',
        'Sale_Price': 'sale_price',
        'Discount_Percentage': 'discount_percentage',
        'Additional_Discount': 'additional_discount',
        'Image_URL': 'image_url',
        'Description': 'description',
    }

    try:
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as out_handle:
            table = csv.DictWriter(out_handle, fieldnames=list(column_sources))
            table.writeheader()
            table.writerows(
                {header: item.get(key, ' ') for header, key in column_sources.items()}
                for item in products
            )
        print(f"Data successfully saved to {csv_file_path}")
        return True
    except IOError as e:
        print(f"Error writing to CSV file: {e}")
        return False

In [None]:
if __name__ == "__main__":
    # Input: text file holding the saved category-page HTML.
    input_file_path = "/content/coach-category.txt"
    # Output: CSV file that receives one row per parsed product.
    output_csv_path = "coach_outlet_bags_8-5-25.csv"

    # A failed load has already printed its own error and yields None,
    # in which case nothing further happens.
    html_content = load_html_from_txt(input_file_path)

    if html_content:
        products = parse_product_tiles(html_content)

        if not products:
            print("No products found in the HTML content.")
        else:
            # Persist first, then echo the same records for visual verification.
            save_to_csv(products, output_csv_path)
            print_product_info(products)

Data successfully saved to coach_outlet_bags_8-5-25.csv
Product: Teri Shoulder Bag In Signature Canvas
Item ID: CV933
Variation ID: IMXAQ
Product URL: https://www.coach.com/products/teri-shoulder-bag-in-signature-canvas/CV933.html?frp=CV933+IMXAQ
Comparable Value:  
Sale Price: $219
Discount:  
Additional Discount:  
Description: Shoulder Bag,Metal,Tag Embellishment,Logo,Embossed,Casual,Brown,Front View
--------------------------------------------------
Product: City Tote Bag In Signature Canvas
Item ID: CV976
Variation ID: IMXDM
Product URL: https://www.coach.com/products/city-tote-bag-in-signature-canvas/CV976.html?frp=CV976+IMXDM
Comparable Value: $398
Sale Price: $149   - Sales Price    - Original Price
Discount: 63%
Additional Discount:  
Description: Tote,Color Block,Embossed,Piping,Metal,Tag Embellishment,Logo,Casual,Beige,Front View
--------------------------------------------------
Product: Laurel Shoulder Bag
Item ID: CR148
Variation ID: IMBLK
Product URL: https://www.coach.c

# New Section