In [39]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time

In [40]:
# FIRST VERSION!!!
def scrape_alo(url):
    """Scrape the title and fabrication text of one product page.

    Loads ``url`` in headless Chrome, parses the resulting page source with
    BeautifulSoup and reads the first ``<h1>`` with class ``productTitle``
    and the first ``<div>`` with class ``fabrication``.

    Args:
        url: Address of the product page to load.

    Returns:
        A one-element list ``[(title, contents)]``. When an element is not
        present its slot holds ``"Title Not Found"`` or
        ``"Details not found"``. If anything raises, the string
        ``"An error occurred: <message>"`` is returned instead.
    """
    # Bound before the try so the finally clause can always inspect it,
    # even when the browser fails to start.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        page_source = driver.page_source

        soup = BeautifulSoup(page_source, "html.parser")

        # Fall back to a placeholder instead of leaving the name unbound
        # when the element is missing from the page.
        title_element = soup.find("h1", class_="productTitle")
        title = title_element.text if title_element else "Title Not Found"

        data_element = soup.find("div", class_="fabrication")
        contents = data_element.text if data_element else "Details not found"

        return [(title, contents)]

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        # Single shutdown point: the browser is closed exactly once, and only
        # if it was actually started.
        if driver is not None:
            driver.quit()

# Product pages to scrape. The first two entries carry extra query parameters
# (disableCurrencyEstimate, gad_source, gclid) in addition to the variant id.
urls = [
    "https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308&disableCurrencyEstimate&gad_source=1&gclid=CjwKCAjww_iwBhApEiwAuG6ccL77JcerRoX7_JAHcpKdcfgONVm-BMBBTVNxENk7-9cIIDnlReDxGxoCaegQAvD_BwE",
    "https://www.aloyoga.com/products/w9715r-alosoft-all-night-tank-fog?variant=42377127395508&disableCurrencyEstimate&gad_source=1&gclid=EAIaIQobChMI97vtmsrthQMVGdDCBB1EIQzYEAQYASABEgLpoPD_BwE",
    "https://www.aloyoga.com/products/w51119r-airbrush-high-waist-7-8-bootcut-legging-black"
]

# Scrape the pages one at a time and print whatever scrape_alo returns for
# each (a result list, or an error string).
for url in urls:
    data = scrape_alo(url)
    print(data)

[('Airbrush Streamlined Bra Tank', 'fabrication\nMedium-compression signature Airbrush performance fabric\xa0\n87% Nylon, 13% Elastane\n\n')]
[('Alosoft All Night Tank', 'fabrication\nUltra-light, breathable signature Alosoft fabric\n\u200b\u200b87% Polyester, 13% Elastane\nMachine wash cold with like colors. Tumble dry low; low iron. Do not dry clean.\n\n')]
[('Airbrush High-Waist 7/8 Bootcut Legging', 'fabrication\nMedium compression signature Airbrush performance fabric\n4-Way stretch for a move-with-you feel\nFlat-locked seams for comfort\nMoisture-wicking & odor resistant\nFront-smoothing panel for commando comfort\nMachine wash cold on gentle cycle, inside out, with like colors. Do not wash with towels. Do not bleach. Tumble dry low.\n\n')]


In [41]:
# SECOND VERSION!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def scrape_alo(url):
    """Scrape the title and product details of one product page.

    Loads ``url`` in headless Chrome and reads three elements by XPath
    directly from the live page: the product title, a "contents" list and a
    "fabrication" list. No explicit wait is performed, so elements that are
    not in the DOM at lookup time count as missing.

    Args:
        url: Address of the product page to load.

    Returns:
        A one-element list ``[(title, details)]``. ``title`` is title-cased,
        or ``"Title Not Found"``. ``details`` is the contents text when it is
        non-empty, otherwise the fabrication text, otherwise
        ``"Details not found"``. If anything else raises, the string
        ``"An error occurred: <message>"`` is returned instead.
    """
    title_xpath = '//h1[contains(@class,"productTitle")]'
    contents_xpath = '//*[@id="shopify-section-react-pdp"]/div[2]/div/div/div/div/div[1]/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div[4]/div[2]/span/ul'
    fabrication_xpath = '//*[@id="REACT-PDP"]/div[5]/div[3]/div[2]/span/ul'

    # Bound before the try so the finally clause can always inspect it,
    # even when the browser fails to start.
    driver = None

    def text_at(xpath):
        """Return the stripped text of the first match, or None if absent."""
        try:
            return driver.find_element(By.XPATH, xpath).text.strip()
        except NoSuchElementException:
            return None

    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        title = text_at(title_xpath)
        title = "Title Not Found" if title is None else title.title()

        # Prefer contents, then fabrication. The earlier if/elif chain dropped
        # the contents text whenever the fabrication list was missing.
        result = (
            text_at(contents_xpath)
            or text_at(fabrication_xpath)
            or "Details not found"
        )

        return [(title, result)]

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        if driver is not None:
            driver.quit()

# Product pages to scrape.
urls = [
    "https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308",
    "https://www.aloyoga.com/products/w9715r-alosoft-all-night-tank-fog?variant=42377127395508",
    "https://www.aloyoga.com/products/w51119r-airbrush-high-waist-7-8-bootcut-legging-black",
    "https://www.aloyoga.com/products/u3032rg-accolade-hoodie-white"
]

# Collects one scrape_alo return value per URL, in URL order.
scraped_data = []
# Scrape the pages sequentially; each result is both stored and printed.
for url in urls:
    data = scrape_alo(url)
    scraped_data.append(data)
    print(data)

[('Airbrush Streamlined Bra Tank', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')]
[('Alosoft All Night Tank', 'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane')]
[('Airbrush High-Waist 7/8 Bootcut Legging', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')]
[('Title Not Found', 'Details not found')]


In [42]:
scraped_data

[[('Airbrush Streamlined Bra Tank',
   'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')],
 [('Alosoft All Night Tank',
   'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane')],
 [('Airbrush High-Waist 7/8 Bootcut Legging',
   'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')],
 [('Title Not Found', 'Details not found')]]

In [43]:
# FINAL VERSION!!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

def scrape_alo(url):
    """Scrape the title and product details of one product page.

    Loads ``url`` in headless Chrome and looks up three elements by XPath:
    the product title, a "contents" list and a "fabrication" list. Each
    lookup waits up to 2 seconds for the element to be present in the DOM.

    Args:
        url: Address of the product page to load.

    Returns:
        A one-element list ``[(title, details)]``. ``title`` is title-cased,
        or ``"Title Not Found"`` on timeout. ``details`` is the contents text
        when it is non-empty, otherwise the fabrication text, otherwise
        ``"Details not found"``. If anything else raises, the string
        ``"An error occurred: <message>"`` is returned instead.
    """
    title_xpath = '//h1[contains(@class,"productTitle")]'
    contents_xpath = '//*[@id="shopify-section-react-pdp"]/div[2]/div/div/div/div/div[1]/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div[4]/div[2]/span/ul'
    fabrication_xpath = '//*[@id="REACT-PDP"]/div[5]/div[3]/div[2]/span/ul'

    # Bound before the try so the finally clause can always inspect it,
    # even when the browser fails to start.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        wait = WebDriverWait(driver, 2)

        def text_at(xpath):
            """Return the stripped text at xpath, or None if the wait times out."""
            try:
                element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
            except TimeoutException:
                return None
            return element.text.strip()

        title = text_at(title_xpath)
        title = "Title Not Found" if title is None else title.title()

        # Prefer contents, then fabrication. The earlier if/elif chain dropped
        # the contents text whenever the fabrication list was missing. The
        # `or` chain also skips the second 2-second wait once contents is found.
        result = (
            text_at(contents_xpath)
            or text_at(fabrication_xpath)
            or "Details not found"
        )

        return [(title, result)]

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        if driver is not None:
            driver.quit()

# Product pages to scrape.
urls = [
    "https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308",
    "https://www.aloyoga.com/products/w9715r-alosoft-all-night-tank-fog?variant=42377127395508",
    "https://www.aloyoga.com/products/w51119r-airbrush-high-waist-7-8-bootcut-legging-black",
    "https://www.aloyoga.com/products/u3032rg-accolade-hoodie-white"
]

# Scrape the pages one at a time and print each result. Nothing is collected
# here, so this cell does not depend on names defined by any other cell.
for url in urls:
    data = scrape_alo(url)
    print(data)


[('Airbrush Streamlined Bra Tank', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')]
[('Alosoft All Night Tank', 'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane')]
[('Airbrush High-Waist 7/8 Bootcut Legging', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')]
[('Accolade Hoodie', 'French terry that’s smooth on outside and fleecy on the inside\n65% Cotton, 35% Polyester\nMachine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle.')]


In [44]:
# USES OPTIMIZATION FOR SPEED, CONSIDER USING THIS WHEN SCRAPING!
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import concurrent.futures

def scrape_alo(url):
    """Scrape the title and product details of one product page.

    Loads ``url`` in its own headless Chrome instance and looks up three
    elements by XPath: the product title, a "contents" list and a
    "fabrication" list. Each lookup waits up to 2 seconds for the element to
    be present in the DOM.

    Args:
        url: Address of the product page to load.

    Returns:
        A ``(title, details)`` tuple. ``title`` is title-cased, or
        ``"Title Not Found"`` on timeout. ``details`` is the contents text
        when it is non-empty, otherwise the fabrication text, otherwise
        ``"Details not found"``. If anything else raises (including a
        failure to start the browser) the tuple
        ``("Error", "An error occurred: <message>")`` is returned.
    """
    title_xpath = '//h1[contains(@class,"productTitle")]'
    contents_xpath = '//*[@id="shopify-section-react-pdp"]/div[2]/div/div/div/div/div[1]/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div[4]/div[2]/span/ul'
    fabrication_xpath = '//*[@id="REACT-PDP"]/div[5]/div[3]/div[2]/span/ul'

    driver = None
    try:
        # Browser setup lives inside the try so a startup failure becomes an
        # ("Error", ...) result for this URL instead of an exception that
        # aborts the caller's whole batch.
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)

        driver.get(url)
        wait = WebDriverWait(driver, 2)  # Reduced wait time

        def text_at(xpath):
            """Return the stripped text at xpath, or None if the wait times out."""
            try:
                element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
            except TimeoutException:
                return None
            return element.text.strip()

        title = text_at(title_xpath)
        title = "Title Not Found" if title is None else title.title()

        # Prefer contents, then fabrication. The earlier if/elif chain dropped
        # the contents text whenever the fabrication list was missing. The
        # `or` chain also skips the second 2-second wait once contents is found.
        result = (
            text_at(contents_xpath)
            or text_at(fabrication_xpath)
            or "Details not found"
        )

        return (title, result)
    except Exception as e:
        return ("Error", f"An error occurred: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()

# Product pages to scrape; read by main() below.
urls = [
    "https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308",
    "https://www.aloyoga.com/products/w9715r-alosoft-all-night-tank-fog?variant=42377127395508",
    "https://www.aloyoga.com/products/w51119r-airbrush-high-waist-7-8-bootcut-legging-black",
    "https://www.aloyoga.com/products/u3032rg-accolade-hoodie-white"
]

def main():
    """Scrape every entry of ``urls`` on a four-thread pool and print the results.

    Results are printed one per line, in the same order as ``urls``.
    """
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
    with pool as executor:
        # Drain the whole map first, so printing starts only once every
        # page has been scraped.
        scraped = [outcome for outcome in executor.map(scrape_alo, urls)]
        for outcome in scraped:
            print(outcome)

# Run the threaded scrape only when this code executes as the main module.
if __name__ == "__main__":
    main()


('Airbrush Streamlined Bra Tank', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')
('Alosoft All Night Tank', 'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane')
('Airbrush High-Waist 7/8 Bootcut Legging', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane')
('Accolade Hoodie', 'French terry that’s smooth on outside and fleecy on the inside\n65% Cotton, 35% Polyester\nMachine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle.')


In [45]:
# Hard-coded (item name, description) pairs used as input for the parsing
# loop below. Each description ends with, or contains, an "NN% Material" list.
product_info = [
    ('Airbrush Streamlined Bra Tank', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane'),
    ('Alosoft All Night Tank', 'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane'),
    ('Airbrush High-Waist 7/8 Bootcut Legging', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane'),
    ('Accolade Hoodie', 'French terry that’s smooth on outside and fleecy on the inside\n65% Cotton, 35% Polyester\nMachine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle.')
]

def parse_fabrication(description):
    """Pull the fibre composition out of a product description.

    Every ``<digits>% <letters>`` occurrence (at least one whitespace
    character after the percent sign) contributes one entry.

    Args:
        description: Free-text product description.

    Returns:
        Dict mapping the lower-cased material word to its integer
        percentage, in order of first appearance. A repeated material keeps
        the value of its last occurrence; no match yields an empty dict.
    """
    import re

    return {
        fibre.lower(): int(share)
        for share, fibre in re.findall(r'(\d+)%\s+([A-Za-z]+)', description)
    }

# Build one flat dict per product: the item name under 'item', plus one
# material -> percentage entry for everything parse_fabrication finds.
product_dicts = []
for product in product_info:
    item_name, description = product
    # Create a dictionary for each item
    item_dict = {'item': item_name}
    # Parsed materials are merged into the same dict as the item name.
    item_dict.update(parse_fabrication(description))
    product_dicts.append(item_dict)

print(product_dicts)


[{'item': 'Airbrush Streamlined Bra Tank', 'nylon': 87, 'elastane': 13}, {'item': 'Alosoft All Night Tank', 'polyester': 87, 'elastane': 13}, {'item': 'Airbrush High-Waist 7/8 Bootcut Legging', 'nylon': 87, 'elastane': 13}, {'item': 'Accolade Hoodie', 'cotton': 65, 'polyester': 35}]


# BELOW WORKS!!!!

In [46]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import concurrent.futures
import re

def parse_products(product_info):
    """Convert ``(item_name, description)`` pairs into a composition dict.

    Every ``<digits>% <letters>`` occurrence in a description becomes a
    ``material -> percentage`` entry next to the ``'item'`` key.

    NOTE(review): only the dict for the LAST pair is returned. Earlier pairs
    are parsed and then discarded. That is kept as is because ``scrape_alo``
    in this cell passes a single pair and relies on getting a dict back.
    Confirm before calling this with several products.

    Args:
        product_info: Iterable of ``(item_name, description)`` pairs.

    Returns:
        ``{'item': name, <material>: <int percentage>, ...}`` for the last
        pair, or an empty dict when ``product_info`` is empty.
    """
    pattern = re.compile(r'(\d+)%\s+([A-Za-z]+)')

    # Start from an empty dict so an empty input returns {} instead of
    # raising UnboundLocalError at the return statement.
    item_dict = {}
    for item_name, description in product_info:
        item_dict = {'item': item_name}
        for percentage, material in pattern.findall(description):
            item_dict[material.lower()] = int(percentage)

    return item_dict

# # Example data
# product_info = [
#     ('Airbrush Streamlined Bra Tank', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane'),
#     ('Alosoft All Night Tank', 'Ultra-soft fabric for cozy comfort\nStretchy fit & soft, brushed finish\nFour-way stretch moves with you\nButtery soft & breathable with an airy feel\n87% Polyester, 13% Elastane'),
#     ('Airbrush High-Waist 7/8 Bootcut Legging', 'Compression fabric for performance support\nSmoothing fit & matte finish\nFour-way stretch moves with you\nSoft & breathable with a cottony feel\n87% Nylon, 13% Elastane'),
#     ('Accolade Hoodie', 'French terry that’s smooth on outside and fleecy on the inside\n65% Cotton, 35% Polyester\nMachine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle.')
# ]

# # Processing the data
# processed_data = parse_products(product_info)
# print(processed_data)

def scrape_alo(url):
    """Scrape one product page and return its parsed fibre composition.

    Loads ``url`` in headless Chrome and looks up three elements by XPath:
    the product title, a "contents" list and a "fabrication" list. Each
    lookup waits up to 2 seconds for the element to be present in the DOM.
    The title and the chosen details text are then passed to
    ``parse_products``.

    Args:
        url: Address of the product page to load.

    Returns:
        Whatever ``parse_products`` returns for ``[(title, details)]``.
        ``title`` is title-cased, or ``"Title Not Found"`` on timeout.
        ``details`` is the contents text when it is non-empty, otherwise the
        fabrication text, otherwise ``"Details not found"``. If anything else
        raises, the string ``"An error occurred: <message>"`` is returned.
    """
    title_xpath = '//h1[contains(@class,"productTitle")]'
    contents_xpath = '//*[@id="shopify-section-react-pdp"]/div[2]/div/div/div/div/div[1]/div/div/div/div/div/div[2]/div/div[2]/div/div/div/div[2]/div[4]/div[2]/span/ul'
    fabrication_xpath = '//*[@id="REACT-PDP"]/div[5]/div[3]/div[2]/span/ul'

    # Bound before the try so the finally clause can always inspect it,
    # even when the browser fails to start.
    driver = None
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        wait = WebDriverWait(driver, 2)

        def text_at(xpath):
            """Return the stripped text at xpath, or None if the wait times out."""
            try:
                element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
            except TimeoutException:
                return None
            return element.text.strip()

        title = text_at(title_xpath)
        title = "Title Not Found" if title is None else title.title()

        # Prefer contents, then fabrication. The earlier if/elif chain dropped
        # the contents text whenever the fabrication list was missing. The
        # `or` chain also skips the second 2-second wait once contents is found.
        result = (
            text_at(contents_xpath)
            or text_at(fabrication_xpath)
            or "Details not found"
        )

        product = [(title, result)]
        return parse_products(product)

    except Exception as e:
        return f"An error occurred: {str(e)}"
    finally:
        if driver is not None:
            driver.quit()

# Product pages to scrape.
urls = [
    "https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308",
    "https://www.aloyoga.com/products/w9715r-alosoft-all-night-tank-fog?variant=42377127395508",
    "https://www.aloyoga.com/products/w51119r-airbrush-high-waist-7-8-bootcut-legging-black",
    "https://www.aloyoga.com/products/u3032rg-accolade-hoodie-white"
]

# Scrape the pages one at a time and print what scrape_alo returns for each
# (the parsed dict, or an error string).
for url in urls:
    data = scrape_alo(url)
    print(data)


{'item': 'Airbrush Streamlined Bra Tank', 'nylon': 87, 'elastane': 13}
{'item': 'Alosoft All Night Tank', 'polyester': 87, 'elastane': 13}
{'item': 'Airbrush High-Waist 7/8 Bootcut Legging', 'nylon': 87, 'elastane': 13}
{'item': 'Accolade Hoodie', 'cotton': 65, 'polyester': 35}
