In [1]:
# imports
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [2]:
# Load the YouTube haul dataset; the preview below shows the columns
# "Unnamed: 0", "Title", "Links" and "VideoLink".
# NOTE(review): "Unnamed: 0" looks like a saved index column - confirm whether it is needed.
df = pd.read_csv('../../data/youtube_data/skims_youtube_data.csv')

# Preview the first two rows.
df.head(2)

Unnamed: 0,Title,Links,VideoLink
0,HUGE SKIMS HAUL | I SPENT OVER $700 ON SKIMS| ...,https://skims.com/products/soft-lounge-long-sl...,https://www.youtube.com/watch?v=Fc8uSovM7KM
1,Skims Try-on Haul &amp; Review | *New In* Marc...,https://skims.com/products/new-vintage-cropped...,https://www.youtube.com/watch?v=VJQWNVVCk6Y


In [3]:
def _title_from_product_url(url):
    """Build a display title from the slug of a ``.../products/<slug>`` URL.

    The query string (everything from ``?``) is dropped and each
    dash-separated word is capitalized. Raises ``IndexError`` when the URL
    contains no ``products/`` segment.
    """
    slug = url.split('products/')[1].split('?')[0]
    return ' '.join(word.capitalize() for word in slug.split('-'))


def _parse_fabric_composition(fit_fabric_text):
    """Parse the first line containing ``%`` into a ``{fabric: percent}`` dict.

    The line is expected to look like ``"47% Cotton / 48% Modal / 5% Spandex"``,
    optionally preceded by a ``"Label: "`` prefix. Fabric names are lower-cased;
    percentages are kept as strings without the ``%`` sign.

    Raises ``ValueError`` when no line contains ``%`` and ``IndexError`` when a
    component does not have the ``"<number>% <fabric>"`` shape.
    """
    composition_line = next(
        (line for line in fit_fabric_text.split('\n') if '%' in line), None
    )
    if composition_line is None:
        raise ValueError('no fabric composition line found')

    # Strip a leading "Label: " prefix. The original computed this split but
    # discarded the result, so the label leaked into the first percentage.
    # Only strip when the prefix holds no percentage, so a colon appearing
    # after the composition cannot throw the data away.
    prefix, separator, remainder = composition_line.partition(': ')
    if separator and '%' not in prefix:
        composition_line = remainder

    materials = {}
    for component in composition_line.split(' / '):
        parts = component.split('% ')
        materials[parts[1].strip().lower()] = parts[0].strip()
    return materials


def _dismiss_signup_popup(driver):
    """Best-effort close of the "Sign Up" popup iframe, if it is present.

    Any failure (popup absent, close button never clickable) is deliberately
    ignored. The driver is always switched back to the top-level document so
    later element lookups do not run inside the iframe.
    """
    try:
        attentive_creative = driver.find_element(By.ID, "attentive_creative")
        driver.switch_to.frame(attentive_creative)
        close_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="closeIconContainer"]'))
        )
        close_button.click()
    except Exception:
        # The popup is optional; not finding or not closing it is not an error.
        pass
    finally:
        driver.switch_to.default_content()


def scrape_skims(url):
    """Scrape the fabric composition of a SKIMS product page.

    Opens ``url`` in headless Chrome, expands the "Fit & Fabric" section and
    parses the fabric percentages found in its text.

    Parameters
    ----------
    url : str
        Product page URL; the item title is derived from the slug after
        ``products/``.

    Returns
    -------
    dict
        ``{'item': <title>, <fabric>: <percent>, ...}`` on success,
        ``{'item': 'Unsupported URL', 'url': url}`` when the browser ends up
        outside ``https://skims.com``, and ``{'item': 'No Data', 'url': url}``
        when anything fails (including a failure to start the browser).
    """
    # Bound before the try so the finally clause is safe even when the
    # browser could not be created.
    driver = None
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')

        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        if not driver.current_url.startswith('https://skims.com'):
            return {'item': 'Unsupported URL', 'url': url}

        # Fixed pause before interacting with the page.
        time.sleep(3)

        interactive_element_xpath = '//button[h2[contains(text(), "Fit & Fabric")]]'
        interactive_element = driver.find_element(By.XPATH, interactive_element_xpath)
        driver.execute_script("arguments[0].scrollIntoView(true);", interactive_element)

        # In case the "Sign Up" popup appears
        _dismiss_signup_popup(driver)

        # Expand the dropdown and read all the text inside it.
        interactive_element.click()
        fit_fabric_dropdown_elem = driver.find_element(By.CLASS_NAME, 'flex.flex-col.items-center.justify-center.pdp-translation-12')
        fit_fabric_text = fit_fabric_dropdown_elem.text

        # Product name comes from the URL (the on-page name isn't specific enough).
        final_data = {'item': _title_from_product_url(url)}
        final_data.update(_parse_fabric_composition(fit_fabric_text))
        return final_data

    except Exception:
        # Exception (not a bare except) so Ctrl-C can still interrupt a long run.
        return {'item': 'No Data', 'url': url}

    finally:
        # Close the WebDriver if it was started.
        if driver is not None:
            driver.quit()

In [4]:
# Smoke-test the scraper on a single product page; the expected output is a
# dict with the item title and one key per fabric.
url = 'https://skims.com/products/boyfriend-boxer-lily'

material_info = scrape_skims(url)
print(material_info)

{'item': 'Boyfriend Boxer Lily', 'cotton': '47', 'modal': '48', 'spandex': '5'}


In [5]:
# Second smoke test on a different product page; rebinds `url` and `material_info`.
url = 'https://skims.com/products/fits-everybody-legging-onyx'

material_info = scrape_skims(url)
print(material_info)

{'item': 'Fits Everybody Legging Onyx', 'polyamide': '76', 'elastane': '24'}


In [6]:
def scrape_and_update(row):
    """Scrape every product link of one DataFrame row.

    Parameters
    ----------
    row : mapping
        A row (e.g. from ``df.apply(..., axis=1)``) whose ``"Links"`` entry
        holds newline-separated product URLs.

    Returns
    -------
    str
        The ``str()`` of each ``scrape_skims`` result, one per line, each
        followed by ``"\n"``. An empty string when ``"Links"`` is not a
        string.
    """
    links = row["Links"]
    if not isinstance(links, str):
        # An empty CSV cell is read by pandas as NaN (a float); calling
        # .split on it would raise and abort the whole apply.
        return ""

    return "".join(str(scrape_skims(link)) + "\n" for link in links.split('\n'))

In [None]:
# Scrape the links of every row. scrape_skims starts a headless Chrome session
# and sleeps 3 seconds for each link, so this cell is long-running.
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

In [33]:
# Write the frame to CSV without the index column.
# NOTE(review): assumes the materials_data/ directory already exists relative to the notebook - confirm.
df.to_csv('materials_data/skims_materials.csv', index=False)