In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from lxml import html
from bs4 import BeautifulSoup
import requests
import time

In [24]:
def scrape_hollister(url):
    """Scrape one product page and return its title plus body composition.

    The page is loaded in headless Chrome, the rendered HTML is parsed with
    Beautiful Soup, and the product title and fabric label texts are handed
    to ``process_item_data`` (which must already be defined in the kernel).

    Args:
        url: Address of the product page to load.

    Returns:
        The result of ``process_item_data(item, materials)`` when the fabric
        label is present; the string ``"Data element not found on the page."``
        when it is missing; or ``"An error occurred: <message>"`` if any
        exception is raised along the way (errors are reported, not raised).
    """
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Initialize the WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            # Get the page source after the page has loaded
            page_source = driver.page_source
        finally:
            # Always shut the browser down, even when loading fails; otherwise
            # every failed scrape leaves a headless Chrome process behind.
            driver.quit()

        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")

        # The title is optional: fall back to an empty string when absent.
        item = ""
        title_element = soup.find("h1", class_="product-title-component")
        if title_element:
            item = title_element.text

        # The fabric label is required: without it there is nothing to parse.
        data_element = soup.find("h4", class_="h4 fabric-care-mfe__label")
        if data_element:
            materials = data_element.text
            return process_item_data(item, materials)
        return "Data element not found on the page."
    except Exception as e:
        return f"An error occurred: {str(e)}"

# Single product page used as a quick check of scrape_hollister.
url = "https://www.hollisterco.com/shop/us/p/low-rise-dark-wash-baggy-jeans-56195470?categoryId=12552&faceout=model&seq=03"
# Scrape that page and print the result (a dict on success, otherwise a
# message string describing what went wrong).
data = scrape_hollister(url)
print(data)

{'item': 'Low-Rise Dark Wash Baggy Jeans', 'cotton': 100}


In [25]:
# Product pages to scrape in the batch run.
urls = [
    "https://www.hollisterco.com/shop/us/p/seamless-fabric-crew-baby-tee-55640321?categoryId=12552&faceout=model&seq=13",
    "https://www.hollisterco.com/shop/us/p/ultra-high-rise-dad-jeans-50414962?categoryId=12552&faceout=model&seq=04",
    "https://www.hollisterco.com/shop/us/p/hollister-sofia-side-smocked-maxi-dress-55640320?categoryId=12552&faceout=life&seq=02",
    "https://www.hollisterco.com/shop/us/p/easy-open-stitch-crochet-cardigan-55639328?categoryId=12627&faceout=model&seq=02"
]

In [26]:
# Scrape each page in `urls` in order, echoing every result as it arrives
# and keeping all of them in `scraped_data`.
scraped_data = []
for url in urls:
    data = scrape_hollister(url)
    print(data)
    scraped_data.append(data)

{'item': 'Soft Stretch Seamless Fabric Crew Baby Tee', 'polyester': 88, 'elastane': 12}
{'item': 'Ultra High-Rise Ripped Light Wash Dad Jeans', 'cotton': 99, 'elastane': 1}
{'item': 'Hollister Sofia Side-Smocked Maxi Dress', 'viscose': 100}
{'item': 'Easy Open-Stitch Crochet-Style Cardigan', 'cotton': 100}


In [20]:
# Display the collected results as the cell output.
# NOTE(review): the output saved under this cell (execution count 20) shows
# (title, materials) tuples, while the batch loop prints dicts. It appears to
# predate process_item_data; re-run the notebook to refresh it.
scraped_data

[('Soft Stretch Seamless Fabric Crew Baby Tee',
  'Body:88% Polyester, 12% Elastane'),
 ('Ultra High-Rise Ripped Light Wash Dad Jeans',
  'Pocket Bag:80% Polyester, 20% Cotton / Body:99% Cotton, 1% Elastane'),
 ('Hollister Sofia Side-Smocked Maxi Dress',
  'Lining:100% Polyester / Elastic:70% Polyester, 30% Rubber / Body:100% Viscose'),
 ('Easy Open-Stitch Crochet-Style Cardigan', 'Body:100% Cotton')]

In [23]:
def process_item_data(item, materials):
    """Turn a fabric label string into a dict of body material percentages.

    ``materials`` is expected to look like
    ``"Lining:100% Polyester / Body:99% Cotton, 1% Elastane"``: components
    separated by ``/``, each written as ``Name:composition``. Only components
    starting with ``Body:`` are parsed. Within a composition, every
    whitespace-separated token containing ``%`` is read as an integer
    percentage and paired with the token that follows it (stripped of
    punctuation and lower-cased) as the material name.

    Args:
        item: Product title, stored under the ``'item'`` key.
        materials: Raw fabric label text.

    Returns:
        A dict ``{'item': item, <material>: <percentage>, ...}``. If the label
        contains no ``Body:`` text at all, the dict instead holds
        ``'materials': "No body material info available."``.
    """
    item_dict = {'item': item}

    if 'Body:' not in materials:
        item_dict['materials'] = "No body material info available."
        return item_dict

    for component in materials.split('/'):
        component = component.strip()
        if not component.startswith('Body:'):
            continue

        _, composition = component.split(':', 1)
        parts = composition.split()

        # Walk by position rather than looking tokens up with list.index():
        # index() returns the first match, so a repeated percentage such as
        # "50% Cotton, 50% Polyester" would pair both entries with "Cotton".
        for position, part in enumerate(parts):
            if '%' not in part:
                continue
            try:
                percentage = int(part.split('%')[0])
            except ValueError:
                # Not a whole-number percentage (e.g. a bare "%"): skip it.
                continue

            material_index = position + 1
            if material_index >= len(parts):
                continue

            # Drop trailing commas and other punctuation from the name.
            material = ''.join(char for char in parts[material_index] if char.isalnum())
            if material:
                item_dict[material.lower()] = percentage

    return item_dict