In [2]:
# import statements
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re

In [3]:
# Load the YouTube haul dataset (one row per video, with its description links)
# and preview the first two rows.
df = pd.read_csv(
    filepath_or_buffer="../../data/youtube_data/shein_youtube_data.csv",
)
df.head(n=2)

Unnamed: 0,Title,Links,VideoLink
0,HUGE SHEIN MIDSIZE SPRING HAUL 2024 - Vacation...,https://bit.ly/3vWbA8e\nhttps://bit.ly/3TUp7p7...,https://www.youtube.com/watch?v=HIXMm9JaQkk
1,SHEIN Home Hacks #shein #sheinhaul #shorts,https://bit.ly/4aK9wPD\nhttps://bit.ly/3xpaUy\...,https://www.youtube.com/watch?v=NFDq_zFTa2w


In [32]:
def _parse_materials(fabric_percentages):
    """Turn entries such as ``'95% Polyester'`` into ``{'polyester': '95'}``.

    Parameters
    ----------
    fabric_percentages : list of str
        One entry per fabric, each expected to contain the ``'% '`` separator.

    Returns
    -------
    dict or None
        Lower-cased fabric name mapped to its percentage (kept as a string),
        or ``None`` when any entry does not contain the ``'% '`` separator.
    """
    materials = {}
    for material_comp in fabric_percentages:
        material_percentage = material_comp.split('% ')
        if len(material_percentage) < 2:
            return None
        percent = material_percentage[0]
        fabric = material_percentage[1].lower()
        materials[fabric] = percent
    return materials


def scrape_shein(url):
    """Scrape the product title and fabric composition from a SHEIN product page.

    The URL is opened in a headless Chrome session (short links are followed by
    the browser), and the page the browser lands on decides which layout is
    parsed: the mobile site (``m.shein.com``) or the desktop site.

    Parameters
    ----------
    url : str
        Link to open. May be a shortened link that redirects to SHEIN.

    Returns
    -------
    dict
        * ``{'item': <title>, <fabric>: <percent>, ...}`` on success, with
          fabric names lower-cased and percentages kept as strings.
        * ``{'item': 'Unsupported URL', 'url': url}`` when the browser does not
          end up on one of the supported SHEIN hosts.
        * ``{'item': 'No Data', 'url': url}`` when the composition cannot be
          found or any error occurs while scraping.

    Notes
    -----
    This function never raises: every exception is converted into the
    ``'No Data'`` record so that a batch run over many links is not aborted by
    one bad page. The browser is always shut down before returning.
    """
    no_data = {'item': 'No Data', 'url': url}
    driver = None

    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # Initialize the WebDriver and load the page
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # Use the URL the browser landed on, not the (possibly shortened) input
        current_url = driver.current_url
        supported_prefixes = (
            'https://us.shein.com',
            'https://www.shein.co',
            'https://m.shein.com',
        )
        if not current_url.startswith(supported_prefixes):
            return {'item': 'Unsupported URL', 'url': url}

        # Parse a snapshot of the page taken before any interaction
        soup = BeautifulSoup(driver.page_source, "html.parser")

        if current_url.startswith('https://m.shein.com'):
            # title
            title_element = soup.find("span", class_="detail-title-text fsp-element")
            title = title_element.text.strip()

            # get XPath of clickable description
            interactive_element_xpath = '//*[@id="detail-view"]/div[2]/div[2]/div[1]/div/div[2]/div/div[2]/div'
            interactive_element = driver.find_element(By.XPATH, interactive_element_xpath)

            # scroll to the element to ensure it's in view, then click it
            driver.execute_script("arguments[0].scrollIntoView(true);", interactive_element)
            driver.execute_script("arguments[0].click();", interactive_element)

            # Not a sleep: this sets the timeout that the element lookup below
            # is allowed to poll for before giving up.
            driver.implicitly_wait(10)

            # get description elements
            description_elements_class = 'goods-attr__list-text-li'
            description_elements = driver.find_elements(By.CLASS_NAME, description_elements_class)

            # keep only the entries that look like a fabric composition
            element_texts = [element.text for element in description_elements]
            composition = [text for text in element_texts if 'Composition' in text and '%' in text]
            if not composition:
                return no_data

            # `composition` is a list; split the first matching entry's text
            fabric_percentages = (composition[0].split(': ')[1]).split(', ')
        else:
            # get name of product
            title_element = soup.find("h1", class_="product-intro__head-name fsp-element")
            title = title_element.text.strip()

            # Find the div with class "key" containing "Composition: "
            pattern = re.compile(r"Composition: | Pants & Tops Composition: | Tops & Pants Composition: | Pants Composition: | Tops Composition: ")
            composition_key = soup.find("div", class_="key", string=pattern)
            if not composition_key:
                return no_data

            # The sibling "val" div of the "key" div holds the composition text
            composition_val = composition_key.find_next_sibling("div", class_="val")
            if composition_val is None:
                return no_data

            fabric_percentages = composition_val.text.strip().split(', ')

        materials = _parse_materials(fabric_percentages)
        if not materials:
            return no_data

        # Build the record once, after all fabrics have been parsed
        final_data = {'item': title}
        final_data.update(materials)
        return final_data

    except Exception:
        # Deliberate best-effort: any failure (missing element, timeout,
        # unexpected page layout, driver start-up error) yields 'No Data'.
        return no_data

    finally:
        # Always close the browser, including on early returns and errors,
        # so repeated calls do not accumulate Chrome processes.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # A failure while shutting down must not break the
                # "never raises" contract of this function.
                pass

In [33]:
# Smoke-test the scraper on the links of a single video (positional row 2):
# print each link, then whatever the scraper returned for it.
top_row = df.iloc[2]
links_list = top_row["Links"].split('\n')

for link in links_list:
    print(link)
    scraped_data = scrape_shein(link)
    # Result (if any) followed by a blank separator line
    print(f"{scraped_data}\n" if scraped_data else "")

https://shein.top/vr08o2i
{'item': "SHEIN Slayr Women's Butterfly & Letter Print Drop Shoulder T-Shirt And Cycling Shorts Set", 'polyester': '100'}

https://shein.top/qz0ci5y
{'item': 'No Data', 'url': 'https://shein.top/qz0ci5y'}

https://shein.top/gq7i4gs
{'item': "SHEIN VCAY Women's Off Shoulder Vacation Dress", 'polyester': '97', 'elastane': '3'}

https://shein.top/hs8v8ex
{'item': "SHEIN Privé Women's Off-Shoulder Lace Patchwork Maxi Dress With Ruffle Trim", 'polyester': '100'}

https://shein.top/gq7i3g1
{'item': "SHEIN Privé Ladies' Off-Shoulder Long-Sleeved Blouse With Ruffle Hem", 'polyester': '100'}

https://shein.top/8s3g65g
{'item': 'SHEIN Frenchy Floral Print Ruffled Hem Blouse', 'polyester': '100'}

https://shein.top/hs8v6ay
{'item': "SHEIN Tween Boys' Loose Fit Color Block Sports Vest And Shorts Set", 'polyester': '100'}

https://shein.top/pwyznup
{'item': "SHEIN Tween Boys' Loose Fit Color Block Soccer Pattern Round Neck Short Raglan Sleeve T-Shirt", 'polyester': '95', '

In [34]:
def scrape_and_update(row):
    """Scrape every link listed in a row and return the results as one string.

    Parameters
    ----------
    row : mapping
        Must provide a ``"Links"`` entry holding newline-separated URLs.

    Returns
    -------
    str
        The ``str()`` of each ``scrape_shein`` result, one per line, each
        terminated by a newline, in the same order as the links.
    """
    result_lines = []

    for link in row["Links"].split('\n'):
        # Progress line is printed before the (slow) scrape starts
        print(f"Scraping data from link: {link}")
        scraped_data = scrape_shein(link)

        result_lines.append(f"{scraped_data}\n")
        print(scraped_data)
        print()

    return "".join(result_lines)

In [35]:
# Run the scraper over every row (one browser session per link) and store the
# combined per-row result text in a new column.
df['ScrapedData'] = df.apply(func=scrape_and_update, axis="columns")

Scraping data from link: https://bit.ly/3vWbA8e
{'item': 'SHEIN EZwear Summer High Waist Straight Leg Jeans', 'cotton': '90', 'polyester': '7', 'viscose': '3'}

Scraping data from link: https://bit.ly/3TUp7p7
{'item': 'DAZY Zip Up Drawstring Hem Drop Shoulder Sweatshirt', 'polyester': '100'}

Scraping data from link: https://bit.ly/4ajcSJz(XL
{'item': 'Unsupported URL', 'url': 'https://bit.ly/4ajcSJz(XL'}

Scraping data from link: https://bit.ly/3TUTzz4
{'item': 'No Data', 'url': 'https://bit.ly/3TUTzz4'}

Scraping data from link: https://bit.ly/4cMr3s9
{'item': 'Mini Minimalist Square Bag With Coin Purse', 'polyester': '100'}

Scraping data from link: https://bit.ly/43QGStQ
{'item': 'Yoga Basic Ribbed Knit Wideband Waist Sports Set', 'polyamide': '92', 'elastane': '8'}

Scraping data from link: https://bit.ly/3vCwQjx
{'item': 'No Data', 'url': 'https://bit.ly/3vCwQjx'}

Scraping data from link: https://bit.ly/43PoBx9(XL
{'item': 'Unsupported URL', 'url': 'https://bit.ly/43PoBx9(XL'}



In [36]:
# Preview the first five rows, now including the ScrapedData column.
df.head(n=5)

Unnamed: 0,Title,Links,VideoLink,ScrapedData
0,HUGE SHEIN MIDSIZE SPRING HAUL 2024 - Vacation...,https://bit.ly/3vWbA8e\nhttps://bit.ly/3TUp7p7...,https://www.youtube.com/watch?v=HIXMm9JaQkk,{'item': 'SHEIN EZwear Summer High Waist Strai...
1,SHEIN Home Hacks #shein #sheinhaul #shorts,https://bit.ly/4aK9wPD\nhttps://bit.ly/3xpaUy\...,https://www.youtube.com/watch?v=NFDq_zFTa2w,"{'item': 'No Data', 'url': 'https://bit.ly/4aK..."
2,Haul Shein 2024 🌴 Ropa Bonita y Comoda Para el...,https://shein.top/vr08o2i\nhttps://shein.top/q...,https://www.youtube.com/watch?v=BxT5v7zXxVk,"{'item': ""SHEIN Slayr Women's Butterfly & Lett..."
3,"COMPRINHAS NA SHEIN, MELHOR MARCA DE LOOKS FIT...",https://shein.top/iu840iv\nhttps://shein.top/7...,https://www.youtube.com/watch?v=iI4hLLAtB6s,"{'item': 'Unsupported URL', 'url': 'https://sh..."
4,Night Suit Haul #afordable #trending #shein #v...,https://m.shein.com/ar-en/Geometric-Printed-Sh...,https://www.youtube.com/watch?v=okd4gpJVd40,"{'item': 'No Data', 'url': 'https://m.shein.co..."


In [37]:
# Persist the enriched table (without the positional index) to the working directory.
df.to_csv(path_or_buf='shein_materials.csv', index=False)