In [121]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re
from selenium.webdriver.common.action_chains import ActionChains

In [139]:
# Scrapes the product name and fabric composition from one product page.
def _parse_fabric_composition(details_text):
    """Extract ``{material: percentage}`` from the text of the details panel.

    The first line containing ``Content:`` is used. Anything after the first
    semicolon on that line is ignored, and each ``<digits>% <name>`` entry
    (comma separated) becomes one dictionary item. Material names are
    lower-cased, stripped, and have the trademark sign removed.

    Args:
        details_text: Visible text of the details panel.

    Returns:
        dict mapping material name (str) to percentage (int); empty when no
        ``Content:`` line is present.
    """
    found = re.search(r"Content:\s*(.+)", details_text)
    if not found:
        return {}

    # Only the part before the first semicolon is parsed.
    composition = found.group(1).split(';', 1)[0]

    materials = {}
    for percentage, material in re.findall(r'(\d+)%\s*(.*?)(?:,|$)', composition):
        materials[material.lower().replace("™", "").strip()] = int(percentage)
    return materials


def scrape_aritzia(url, wait_time=3):
    """Scrape the product name and fabric composition from a product page.

    A fresh Chrome session is started for every call and is always shut down
    before returning, including when start-up itself fails.

    Args:
        url: Address of the product page to open.
        wait_time: Seconds to pause before clicking the tab link, and also the
            timeout (in seconds) for the details panel to become visible.

    Returns:
        ``{"item": <title-cased product name>, <material>: <int percent>, ...}``
        on success, or ``{'item': 'No Data', 'url': url}`` if any step raises.
    """
    # Bound before the try block so the finally clause can tell whether a
    # browser was actually started; otherwise a start-up failure would raise
    # UnboundLocalError from `driver.quit()` and hide the fallback result.
    driver = None
    try:
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
        chrome_options = Options()
        # Headless mode is currently disabled; uncomment to run without a window.
        # chrome_options.add_argument("--headless")
        chrome_options.add_argument(f"user-agent={user_agent}")

        # Disabling images
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)

        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # `until` either returns the element or raises TimeoutException, so no
        # None check is needed on the result.
        product_name_element = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, 'h1.js-product-detail__product-name'))
        )
        product_name = product_name_element.text.strip().title()

        # Pause for the requested time before interacting with the page.
        time.sleep(wait_time)

        # Wait until the tab link can actually be clicked instead of assuming
        # it is ready once the pause is over.
        # NOTE(review): presumably this is the "Details" tab, since the panel
        # awaited below is #pdp-panel__details -- confirm against the page.
        interactive_element_xpath = '//*[@id="primary"]/div[1]/div[1]/div[3]/div/ul/li[1]/a'
        interactive_element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, interactive_element_xpath))
        )
        interactive_element.click()

        # Wait for the loaded content to be visible
        loaded_content_xpath = '//*[@id="pdp-panel__details"]/div'
        loaded_element = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.XPATH, loaded_content_xpath))
        )

        materials = _parse_fabric_composition(loaded_element.text.strip())
        return {"item": product_name, **materials}

    except Exception:
        # Best-effort scraping: any failure is reported as a placeholder row so
        # that a batch run keeps going.
        return {'item': 'No Data', 'url': url}

    finally:
        # Close the WebDriver only if one was started.
        if driver is not None:
            driver.quit()

# Example usage:
# url = "https://www.aritzia.com/us/en/product/renewal-dress/117600006.html"
# url = "https://www.aritzia.com/us/en/product/flor-top/115882013.html"
# url = "https://www.aritzia.com/us/en/product/the-%2780s-comfy-denim-shirt/120515.html?dwvar_120515_color=32984"
# url = "https://www.aritzia.com/us/en/product/lodge-linen-pant/118269.html?dwvar_118269_color=21352"
# url = "https://www.aritzia.com/us/en/product/new-power-blazer/111931.html?dwvar_111931_color=6521"
# result = scrape_aritzia(url)
# if result:
#     print(result)

{'item': 'New Power Blazer', 'polyester': 61, 'lenzing ecovero viscose': 26, 'cotton': 7, 'elastane': 6}


In [140]:
# Product pages to scrape in this batch.
urls = [
    "https://www.aritzia.com/us/en/product/lodge-linen-pant/118269.html?dwvar_118269_color=21352",
    "https://www.aritzia.com/us/en/product/renewal-dress/117600006.html",
    "https://www.aritzia.com/us/en/product/flor-top/115882013.html",
    "https://www.aritzia.com/us/en/product/new-power-blazer/111931.html?dwvar_111931_color=6521",
    "https://www.aritzia.com/us/en/product/the-%2780s-comfy-denim-shirt/120515.html?dwvar_120515_color=32984"
]

# Scrape the pages one after another, keeping every non-empty result.
results = []
for url in urls:
    result = scrape_aritzia(url)
    if not result:
        continue
    results.append(result)

# Show all dictionaries collected from the batch.
print(results)

[{'item': 'Lodge Linen Pant', 'tencel lyocell': 66, 'linen': 34}, {'item': 'Renewal Dress', 'polyester': 100}, {'item': 'Flor Top', 'lenzing ecovero viscose': 62, 'polyester': 38}, {'item': 'No Data', 'url': 'https://www.aritzia.com/us/en/product/new-power-blazer/111931.html?dwvar_111931_color=6521'}, {'item': "The '80S Comfy Denim Shirt", 'lyocell': 56, 'cotton': 29, 'linen': 15}]


# Below is a class-based version of the scraper that reuses one browser session for all URLs

In [None]:
# CLASS
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

class AritziaScraper:
    """Scrape product names and fabric compositions from product pages.

    One headless Chrome session is started when the object is created and is
    reused by every call to :meth:`scrape`. Release it with
    :meth:`close_driver`, or use the object as a context manager::

        with AritziaScraper() as scraper:
            rows = [scraper.scrape(u) for u in urls]
    """

    def __init__(self):
        # The browser session shared by all scrape() calls; None once closed.
        self.driver = self.initialize_driver()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppress errors."""
        self.close_driver()
        return False

    def initialize_driver(self):
        """Create and return a headless Chrome driver with images disabled."""
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36")
        # Disabling images
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    @staticmethod
    def _parse_materials(details_text):
        """Extract ``{material: percentage}`` from the details panel text.

        The first line containing ``Content:`` is used. Anything after the
        first semicolon on that line is ignored, and each ``<digits>% <name>``
        entry (comma separated) becomes one item. Names are lower-cased,
        stripped, and have the trademark sign removed.

        Returns:
            dict of material name (str) to percentage (int); empty when no
            ``Content:`` line is present.
        """
        found = re.search(r"Content:\s*(.+)", details_text)
        if not found:
            return {}

        # Only the part before the first semicolon is parsed.
        composition = found.group(1).split(';', 1)[0]

        materials = {}
        for percentage, material in re.findall(r'(\d+)%\s*(.*?)(?:,|$)', composition):
            materials[material.lower().replace("™", "").strip()] = int(percentage)
        return materials

    def scrape(self, url, wait_time=3):
        """Scrape one product page using the shared browser session.

        Args:
            url: Address of the product page to open.
            wait_time: Seconds to pause before clicking the tab link, and also
                the timeout (in seconds) for the details panel to appear.

        Returns:
            ``{"item": <title-cased product name>, <material>: <int percent>, ...}``
            on success, or ``{'item': 'No Data', 'url': url}`` if any step
            raises (including when the driver has already been closed).
        """
        try:
            self.driver.get(url)

            # `until` returns the element or raises TimeoutException, so the
            # result never needs a None check.
            product_name_element = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, 'h1.js-product-detail__product-name'))
            )
            product_name = product_name_element.text.strip().title()

            # Pause once before interacting with the page.
            time.sleep(wait_time)

            # Wait for the tab link to be clickable rather than sleeping
            # between locating it and clicking it.
            # NOTE(review): presumably this is the "Details" tab, since the
            # panel awaited below is #pdp-panel__details -- confirm.
            interactive_element_xpath = '//*[@id="primary"]/div[1]/div[1]/div[3]/div/ul/li[1]/a'
            interactive_element = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, interactive_element_xpath))
            )
            interactive_element.click()

            loaded_content_xpath = '//*[@id="pdp-panel__details"]/div'
            loaded_element = WebDriverWait(self.driver, wait_time).until(
                EC.visibility_of_element_located((By.XPATH, loaded_content_xpath))
            )

            # Parsing is pure string work, so no further pauses are needed
            # once the panel text has been read.
            materials = self._parse_materials(loaded_element.text.strip())
            return {"item": product_name, **materials}

        except Exception:
            # Best-effort scraping: report a placeholder row and let the
            # caller continue with the next URL.
            return {'item': 'No Data', 'url': url}

    def close_driver(self):
        """Shut down the browser session; calling it again is a no-op."""
        driver, self.driver = getattr(self, "driver", None), None
        if driver is not None:
            driver.quit()

# Usage
# Start a single scraper whose browser session serves the whole batch.
scraper = AritziaScraper()

# Collect one result dictionary per URL; the browser is shut down even if the
# loop is interrupted part-way through.
results = list()
try:
    for url in urls:
        result = scraper.scrape(url)
        results += [result]
finally:
    scraper.close_driver()

# Show everything gathered by the class-based scraper.
print(results)
