In [2]:
# import statements
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re

In [4]:
# Load the SHEIN YouTube dataset; the preview below shows a Title and a Links column per row
df = pd.read_csv("../../data/youtube_data/shein_youtube_data.csv")
# Preview the first two rows
df.head(2)

Unnamed: 0,Title,Links
0,"HUGE SHEIN SUMMER HAUL | jewerly, sunglasses, ...",http://api-shein.shein.com/h5/sharejump/appjum...
1,Disturbing message in SHEIN clothing goes vira...,https://www.vox.com/the-goods/22573682/shein-f...


In [107]:
def _fetch_page_source(url):
    """Load ``url`` in headless Chrome and return the page's HTML source."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when the page load fails;
        # otherwise every failed link leaves a Chrome process behind.
        driver.quit()


def _parse_composition(composition_text):
    """Parse text such as "85% Polyamide, 15% Elastane" into {fabric: percent}.

    Each "<number>% <fabric name>" pair is matched as a unit, so multi-word
    fabric names stay intact and stray tokens cannot shift the pairing.
    Percentages are kept as strings; fabric names are lower-cased.
    """
    materials = {}
    pairs = re.findall(r"(\d+(?:\.\d+)?)\s*%\s*([^\d%,;]+)", composition_text)
    for percent, fabric in pairs:
        # Collapse internal whitespace and normalise case for use as a dict key
        fabric = " ".join(fabric.split()).lower()
        if fabric:
            materials[fabric] = percent
    return materials


def scrape_shein(url, verbose=False):
    """Scrape the product name and fabric composition from a SHEIN product page.

    Parameters
    ----------
    url : str
        Address of the product page.
    verbose : bool, optional
        When True, print the reason whenever no data can be returned.
        Defaults to False, which keeps the function silent.

    Returns
    -------
    dict or None
        ``{'item': <product name>, <fabric>: <percent string>, ...}`` when the
        page has a product title and a composition entry, e.g.
        ``{'item': 'Solid Bandeau Bra', 'polyamide': '85', 'elastane': '15'}``.
        If the composition text holds no "<number>% <fabric>" pair, only the
        ``'item'`` key is present. ``None`` is returned when the title or the
        composition entry is missing, or when any error occurs while loading
        or parsing the page (scraping is best-effort and never raises).
    """
    try:
        soup = BeautifulSoup(_fetch_page_source(url), "html.parser")

        # Name of the product
        title_element = soup.find("h1", class_="product-intro__head-name fsp-element")
        if title_element is None:
            if verbose:
                print('Product title not found.')
            return None
        title = title_element.text.strip()

        # The "key" div whose text contains "Composition:"; a substring search
        # also covers labels like "Pants Composition:" or "Tops Composition:".
        composition_key = soup.find("div", class_="key", string=re.compile(r"Composition\s*:"))
        if composition_key is None:
            if verbose:
                print('Composition key not found.')
            return None

        # The sibling "val" div carries the actual composition text
        composition_val = composition_key.find_next_sibling("div", class_="val")
        if composition_val is None:
            if verbose:
                print('Composition information not found.')
            return None

        final_data = {'item': title}
        final_data.update(_parse_composition(composition_val.text))
        return final_data
    except Exception as e:
        # Deliberate best-effort: a broken link must not abort a bulk scrape
        if verbose:
            print(f'An error occurred: {str(e)}')
        return None

In [50]:
# Smoke test: URL of a single SHEIN product page to scrape
url = "https://us.shein.com/Solid-Bandeau-Bra-p-12216010.html?src_module=All&src_identifier=on=PRODUCT_ITEMS_COMPONENT`cn=salezone`hz=0`ps=4_1_0`jc=itemPicking_100546960&src_tab_page_id=page_home1713242748965&mallCode=1&imgRatio=3-4"
# Call the function to scrape the page; the result is a dict, or None if scraping failed
data = scrape_shein(url)
print(data)

Solid Bandeau Bra
{'item': 'Solid Bandeau Bra', 'polyamide': '85', 'elastane': '15'}


In [83]:
# Try the scraper on every link of one row (positional index 2)
top_row = df.iloc[2]
# The Links cell holds one URL per line
links_list = top_row["Links"].split('\n')

for link in links_list:
    print(link)
    scraped_data = scrape_shein(link)
    # scrape_shein yields None when nothing could be scraped; print real results only
    if scraped_data:
        print(scraped_data)
    print()  # blank line between links

https://shein.top/vyozfry
Women's Elegant Brown Butterfly Decorated Square Toe Slip-on Ballet Flats With Bow Tie
Composition information not found.

https://shein.top/6din4ps
SHEIN VCAY Allover Floral Knot Split Thigh A-line Dress
100% Polyester
['100%', 'Polyester']
{'item': 'SHEIN VCAY Allover Floral Knot Split Thigh A-line Dress', 'polyester': '100'}

https://shein.top/f3qr2g5
SHEIN Privé Lantern Sleeve Twist Front Wrap Hem Satin Dress
100% Polyester
['100%', 'Polyester']
{'item': 'SHEIN Privé Lantern Sleeve Twist Front Wrap Hem Satin Dress', 'polyester': '100'}

https://shein.top/r3e0nfm
SHEIN Privé Lantern Sleeve Twist Front Wrap Hem Satin Dress
100% Polyester
['100%', 'Polyester']
{'item': 'SHEIN Privé Lantern Sleeve Twist Front Wrap Hem Satin Dress', 'polyester': '100'}

https://shein.top/thdyp3z
Women Colorblock Lace-Up Front Chunky Sneakers
Composition information not found.

https://liketk.it/45Euc
An error occurred: 'NoneType' object has no attribute 'text'



In [108]:
def scrape_and_update(row):
    """Scrape every link in a row's ``Links`` cell and combine the results.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series from ``df.apply(..., axis=1)``)
        Must provide a ``"Links"`` entry holding newline-separated URLs.

    Returns
    -------
    str
        The ``str()`` of each successfully scraped result, one per line
        (each followed by a newline). Empty string when the cell has no
        links or nothing could be scraped.
    """
    links = row["Links"]
    # A missing cell is not a string (pandas uses float NaN) and has no
    # .split(); treat it as "no links" instead of aborting the whole apply.
    if not isinstance(links, str):
        return ""

    scraped_parts = []
    for link in links.split('\n'):
        link = link.strip()
        # Skip blank entries (empty cell, trailing newline) so we do not
        # start a browser and wait for a link that does not exist.
        if not link:
            continue

        print(f"Scraping data from link: {link}")
        scraped_data = scrape_shein(link)
        if scraped_data:
            scraped_parts.append(str(scraped_data) + "\n")
            print(scraped_data)
        # Pause between requests to avoid hammering the site
        time.sleep(3)
        print()

    return "".join(scraped_parts)

In [110]:
# Scrape the links of every row and store the combined result text in a new column.
# Slow: each link starts its own headless Chrome session and is followed by a 3-second pause.
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=Vwej1ModnvN&localcountry=US&url_from=GM7396757845416689664
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=Vwej6GkXSZH&localcountry=US&url_from=GM7396758049551491072
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=VwejoSGtbpc&localcountry=US&url_from=GM7396758154843078656
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=VwejbrRK8A6&localcountry=US&url_from=GM7396758187885805568
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=VwejIr7lxt3&localcountry=US&url_from=GM7396758216314798080
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=Vwejt0qIlad&localcountry=US&url_from=GM7396758251102355456
Scraping data from link: http://api-shein.shein.com/h5/sharejump/appjump?link=VwejKixkFJw&localcountry=US&url_from=GM7396758284218605568
Scraping data from link: http://api-shein

In [111]:
# Inspect the result; ScrapedData is an empty string for rows where nothing was scraped
df.head()

Unnamed: 0,Title,Links,ScrapedData
0,"HUGE SHEIN SUMMER HAUL | jewerly, sunglasses, ...",http://api-shein.shein.com/h5/sharejump/appjum...,
1,Disturbing message in SHEIN clothing goes vira...,https://www.vox.com/the-goods/22573682/shein-f...,
2,Shein haul review,https://shein.top/vyozfry\nhttps://shein.top/6...,{'item': 'SHEIN VCAY Allover Floral Knot Split...
3,HUGE SHEIN TRY-ON HAUL 2023 *summer edition* |...,http://api-shein.shein.com/h5/sharejump/appjum...,
4,"SHEIN unboxing haul: dresses, workwear, lounge...",https://liketk.it/40R4F,


In [112]:
# Save the table, including the new ScrapedData column, without writing the DataFrame index
df.to_csv('shein_materials.csv', index=False)