In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time

In [16]:
def scrape_alo(url):
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Initialize the WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        
        # Wait for the data element to be present on the page
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "fabrication")))
        
        # Get the page source after interactions
        page_source = driver.page_source
        
        # Close the WebDriver once done to avoid multiple instances
        driver.quit()
        
        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        
        # Extract data using Beautiful Soup methods
        data_title = soup.find("h1", class_="productTitle")
        data_element = soup.find("div", class_="fabrication")
        
        if data_element:
            return data_title.text, data_element.text
        else:
            return "Data element not found on the page."
    except Exception as e:
        return f"An error occurred: {str(e)}"


In [14]:
df = pd.read_csv("../../data/youtube_data/alo_youtube_data.csv")
top_row = df.iloc[4]
links_list = top_row["Links"].split('\n')

for link in links_list:
    print(f"Scraping data from link: {link}")
    scraped_data = scrape_alo(link)
    print(scraped_data + "\n")
    time.sleep(3)

Scraping data from link: https://rstyle.me/cz-n/hgjsvqd3q47
fabrication
Seamless cable knit
Removeable cups
Machine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle. Do not iron. Do not dry clean.



Scraping data from link: https://rstyle.me/cz-n/hgjsy5d3q47
fabrication
Airy poly-spandex blend
Machine wash cold with like colors. Tumble dry low; low iron. Do not dry clean.



Scraping data from link: https://rstyle.me/cz-n/hgjs28d3q47
fabrication
Smooth Airbrush & airy mesh
Corset-style structured panels



Scraping data from link: https://rstyle.me/cz-n/hgjs5kd3q47
fabrication
Super-soft, ribbed jersey
Henley placket detail
Logo band at ribs



Scraping data from link: https://rstyle.me/cz-n/hgjs7qd3q47
fabrication
Lightweight, slightly swishy, stretch-infused fabric
86% Polyester, 14% Spandex



Scraping data from link: https://rstyle.me/cz-n/hgjwrmd3q47
fabrication
Medium compression signature Airbrush performance fabric
4-Way stretch for a move-with-you feel
Flat

In [17]:
url = "https://rstyle.me/cz-n/hgjsvqd3q47"

scrape_alo(url)

('Seamless Cable Knit Bra',
 'fabrication\nSeamless cable knit\nRemoveable cups\nMachine wash separately on cold, gentle cycle. Tumble dry low, gentle cycle. Do not iron. Do not dry clean.\n\n')