In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

In [18]:
def process_item_data(item, materials):
    item_dict = {'item': item}
    
    # Process materials
    if 'Body:' in materials:
        components = materials.split('/')
        for component in components:
            component = component.strip()
            if component.startswith('Body:'):
                _, composition = component.split(':', 1)
                parts = composition.strip().split()
                
                for part in parts:
                    if '%' in part:
                        try:
                            percentage = int(part.split('%')[0])
                            material_index = parts.index(part) + 1
                            if material_index < len(parts):
                                material = parts[material_index].strip()
                                material = ''.join(char for char in material if char.isalnum() or char.isspace())
                                item_dict[material.lower()] = percentage
                        except ValueError:
                            continue
    else:
        item_dict['materials'] = "No body material info available."
        
    return item_dict

In [14]:
def scrape_hollister(url):
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # Initialize the WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        
        # Wait for the page to be loaded and redirected if necessary
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Optionally check if URL has changed after redirection
        final_url = driver.current_url
        print(f"Final URL after redirection: {final_url}")
        
        # Get the page source after interactions
        page_source = driver.page_source
        print(page_source)

        # Close the WebDriver once done to avoid multiple instances
        driver.quit()
        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        # Extract data using Beautiful Soup methods
        
        item = ""
        title_element = soup.find("h1", class_="product-title-component")
        if title_element:
            item = title_element.text
        data_element = soup.find("h4", class_="h4 fabric-care-mfe__label")
        if data_element:
            materials = data_element.text
            return process_item_data(item, materials)
        else:
            return "Data element not found on the page."
    except:
        return {'item': 'No Data', 'url': url}

# URL of the webpage you want to scrape
# url = "https://www.hollisterco.com/shop/us/p/low-rise-dark-wash-baggy-jeans-56195470?categoryId=12552&faceout=model&seq=03"
url = "https://go.magik.ly/ml/1z6qr/"
# Call the function to scrape the website
data = scrape_hollister(url)
print(data)

Final URL after redirection: https://go.magik.ly/ml/1z6qr/
<html><head>
    <title>Hollister</title>
    <meta name="robots" content="noindex,nofollow">
    <meta content="4;url=https://imp.i255443.net/xMWxR?u=https%3A%2F%2Fwww.hollisterco.com%2Fshop%2Fus%2Fp%2Fdark-wash-straight-jeans-52320927&amp;subId1=66356580b98173092652e0ad&amp;sharedid=66356580b98173092652e0ad" http-equiv="refresh">
	<script async="" src="https://www.google-analytics.com/analytics.js"></script><script type="text/javascript">
		!function(){var t;t=window.XMLHttpRequest?new XMLHttpRequest:new ActiveXObject("Microsoft.XMLHTTP"),t.onreadystatechange=function(){if(4==t.readyState&&200==t.status){var n=t.responseText,r=JSON&&JSON.parse(n);r.purl?window.setTimeout(function(){window.location.replace(r.purl)},700):true&&window.setTimeout(function(){window.location.replace('https://imp.i255443.net/xMWxR?u=https%3A%2F%2Fwww.hollisterco.com%2Fshop%2Fus%2Fp%2Fdark-wash-straight-jeans-52320927&subId1=66356580b98173092652e0ad&

## Scraping below

In [6]:
df = pd.read_csv('../../data/youtube_data/hollister_youtube_data.csv')

df.head(2)

Unnamed: 0,Title,Links,VideoLink
0,HOLLISTER BACK TO SCHOOL HAUL * TRY-ON *,https://www.hollisterco.com/shop/us/p/low-rise...,https://www.youtube.com/watch?v=VH5ItHzPnzE
1,MEN&#39;S HOLLISTER TRY-ON | Gifts for HIM 202...,https://youtu.be/K214j156bMY\nhttps://go.magik...,https://www.youtube.com/watch?v=uVbDD1CQngY


In [1]:
def scrape_and_update(row):
    links_list = row["Links"].split('\n')
    scraped_data_combined = ""

    for link in links_list:
        scraped_data = scrape_hollister(link)
        scraped_data_combined += str(scraped_data) + "\n"
        print(scraped_data)
        time.sleep(3)
        print()

    return scraped_data_combined

In [16]:
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

In [13]:
df.head()

Unnamed: 0,Title,Links,VideoLink,ScrapedData
0,HOLLISTER BACK TO SCHOOL HAUL * TRY-ON *,https://www.hollisterco.com/shop/us/p/low-rise...,https://www.youtube.com/watch?v=VH5ItHzPnzE,Data element not found on the page.\nData elem...
1,MEN&#39;S HOLLISTER TRY-ON | Gifts for HIM 202...,https://youtu.be/K214j156bMY\nhttps://go.magik...,https://www.youtube.com/watch?v=uVbDD1CQngY,Data element not found on the page.\nData elem...
2,"CUTE AND CASUAL HOLLISTER HAUL | Summer 2024, ...",https://rstyle.me/+r7tiva_joSh9ZSUMhB_T-A\nhtt...,https://www.youtube.com/watch?v=QkSjTXhotxw,Data element not found on the page.\nData elem...
3,Spring Hollister Haul | Affordable Basics For ...,https://rstyle.me/+Me9wZ_UjuV4xc1MFQZtwBw\nhtt...,https://www.youtube.com/watch?v=gzVIZB1L6Bs,{'item': 'Easy Terry Off-the-Shoulder Sweatshi...
4,HUGE summer clothing haul! + try on! || hollis...,https://us.brandymelville.com/products/bella-r...,https://www.youtube.com/watch?v=IHTJRvxWJuk,Data element not found on the page.\nData elem...


In [7]:
df.to_csv('materials_data/hollister_materials.csv', index=False)

In [23]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

def scrape_hollister(url):
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--enable-javascript")  # Ensure JS is enabled
        
        # Initialize the WebDriver
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        
        # Wait for possible redirects and JavaScript execution
        time.sleep(4)  # Adjust time based on observation, ideally use smarter wait

        # Checking final URL and wait for body tag to ensure load completion
        final_url = driver.current_url
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # print(f"Final URL after redirection: {final_url}")
        
        # Get the page source after interactions
        page_source = driver.page_source
        
        # Close the WebDriver once done to avoid multiple instances
        driver.quit()
        
        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        
        # Extract data using Beautiful Soup methods
        item = ""
        title_element = soup.find("h1", class_="product-title-component")
        if title_element:
            item = title_element.text.strip()
        else:
            item = "None"
        
        materials = ""
        data_element = soup.find("h4", class_="h4 fabric-care-mfe__label")
        if data_element:
            materials = data_element.text.strip()
        
        return process_item_data(item, materials)
    except Exception as e:
        print(f"Error during scraping: {str(e)}")
        return {'item': 'No Data', 'url': url}

# URL of the webpage you want to scrape
url = "https://go.magik.ly/ml/1z6qr/"  # Example shortened URL
# Call the function to scrape the website
data = scrape_hollister(url)
print(data)


{'item': 'Dark Wash Straight Jeans', 'cotton': 95, 'polyester': 4, 'elastane': 1}


In [25]:
def scrape_and_update(row):
    links_list = row["Links"].split('\n')
    scraped_data_combined = ""

    for link in links_list:
        print(f"Scraping data from link: {link}")
        scraped_data = scrape_hollister(link)
        if scraped_data:
            scraped_data_combined += str(scraped_data) + "\n"
            print(scraped_data)
        time.sleep(3)
        print()

    return scraped_data_combined

In [26]:
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

Scraping data from link: https://www.hollisterco.com/shop/us/p/low-rise-light-wash-baggy-jeans-53527368
{'item': 'None', 'materials': 'No body material info available.'}

Scraping data from link: https://www.hollisterco.com/shop/us/p/feel-good-oversized-full-zip-hoodie-53686825?seq=09
{'item': 'None', 'materials': 'No body material info available.'}

Scraping data from link: https://www.hollisterco.com/shop/us/p/ultra-high-rise-black-dad-jeans-49943883
{'item': 'Ultra High-Rise Black Dad Jeans', 'cotton': 98, 'elastane': 2}

Scraping data from link: https://www.hollisterco.com/shop/us/p/short-sleeve-smocked-bust-babydoll-top-53734320?seq=03
{'item': 'None', 'materials': 'No body material info available.'}

Scraping data from link: https://www.hollisterco.com/shop/us/p/seamless-fabric-square-neck-baby-tee-53504330?seq=14
{'item': 'Seamless Fabric Square-Neck Baby Tee', 'polyester': 88, 'elastane': 12}

Scraping data from link: https://depop.com/tbird101
{'item': 'None', 'materials': 'No

In [27]:
df.head()

Unnamed: 0,Title,Links,VideoLink,ScrapedData
0,HOLLISTER BACK TO SCHOOL HAUL * TRY-ON *,https://www.hollisterco.com/shop/us/p/low-rise...,https://www.youtube.com/watch?v=VH5ItHzPnzE,"{'item': 'None', 'materials': 'No body materia..."
1,MEN&#39;S HOLLISTER TRY-ON | Gifts for HIM 202...,https://youtu.be/K214j156bMY\nhttps://go.magik...,https://www.youtube.com/watch?v=uVbDD1CQngY,"{'item': 'None', 'materials': 'No body materia..."
2,"CUTE AND CASUAL HOLLISTER HAUL | Summer 2024, ...",https://rstyle.me/+r7tiva_joSh9ZSUMhB_T-A\nhtt...,https://www.youtube.com/watch?v=QkSjTXhotxw,"{'item': 'Poplin Wide-Leg Sleep Pants', 'cotto..."
3,Spring Hollister Haul | Affordable Basics For ...,https://rstyle.me/+Me9wZ_UjuV4xc1MFQZtwBw\nhtt...,https://www.youtube.com/watch?v=gzVIZB1L6Bs,{'item': 'Easy Terry Off-the-Shoulder Sweatshi...
4,HUGE summer clothing haul! + try on! || hollis...,https://us.brandymelville.com/products/bella-r...,https://www.youtube.com/watch?v=IHTJRvxWJuk,"{'item': 'None', 'materials': 'No body materia..."


In [28]:
df.to_csv('materials_data/hollister_materials2.csv', index=False)