In [47]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re

In [49]:
# Load the YouTube video table (the sample output lists the columns Title, Links, VideoLink).
# NOTE(review): that output also shows an "Unnamed: 0" column — presumably a row index
# that was written into the CSV; confirm before switching to index_col=0.
df = pd.read_csv(
    filepath_or_buffer="../../data/youtube_data/uniqlo_youtube_data.csv",
)

In [60]:
# Peek at one randomly chosen row to check the loaded columns.
df.sample(n=1)

Unnamed: 0,Title,Links,VideoLink
4,I ordered EVERY JEANS From UNIQLO - WORTH IT? ...,https://www.uniqlo.com/in/en/special-feature/j...,https://www.youtube.com/watch?v=QKhGbM_GkYo


In [57]:
def scrape_uniqlo(url, wait_time=2):
    """Scrape the product title and material percentages from a product page.

    Loads ``url`` in a headless Chrome session, clicks the element with id
    ``productMaterialDescription``, waits for the first paragraph under
    ``productMaterialDescription-content`` to become visible, and extracts
    every ``<digits>% <word>`` pair from that paragraph's text.

    Args:
        url: Address of the page to open.
        wait_time: Seconds to sleep after the page is requested and before the
            click; also used as the timeout (in seconds) while waiting for the
            material text to become visible.

    Returns:
        A dict ``{"item": <title>, <material>: <percent>, ...}`` where material
        names are lower-cased and percentages are ints, or ``None`` if any step
        failed. Failures are printed rather than raised.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
    interactive_element_xpath = '//*[@id="productMaterialDescription"]'
    loaded_content_xpath = '//*[@id="productMaterialDescription-content"]/dl/dd[1]/p'
    title_xpath = '//*[@id="root"]/div[4]/div/section/div[2]/div[2]/div[1]/div/ul/li[1]/h1'
    # Only the first word after the percentage is captured as the material name.
    material_pattern = r'(\d+)%\s*(\w+)'

    # Bound before the try block so the finally clause can tell whether a
    # browser was ever started; otherwise a failure while creating the driver
    # would surface as an UnboundLocalError instead of returning None.
    driver = None
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument(f"user-agent={user_agent}")

        # Initialize the WebDriver with headless mode
        driver = webdriver.Chrome(options=chrome_options)

        # Open the webpage
        driver.get(url)

        # Give the page a fixed amount of time before looking for the element
        time.sleep(wait_time)

        # Find and click the element that controls the material description
        driver.find_element(By.XPATH, interactive_element_xpath).click()

        # Wait for the material text to be visible
        loaded_element = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.XPATH, loaded_content_xpath))
        )
        dynamic_content = loaded_element.text

        # Scrape the title
        title = driver.find_element(By.XPATH, title_xpath).text

        # Collect the title plus one entry per "<percent>% <material>" match;
        # a material that appears more than once keeps its last percentage.
        scraped_data = {"item": title}
        for percent, material in re.findall(material_pattern, dynamic_content):
            scraped_data[material.lower()] = int(percent)

        return scraped_data

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

    finally:
        # Close the WebDriver, but only if one was actually created
        if driver is not None:
            driver.quit()


In [61]:
def scrape_and_update(row):
    """Scrape every link in a row's ``Links`` field and combine the results.

    Args:
        row: Mapping-like record (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) whose ``"Links"`` entry holds
            newline-separated URLs.

    Returns:
        A string with one ``str(dict)`` line per link that ``scrape_uniqlo``
        scraped successfully, each terminated by a newline. Empty string when
        there are no links or nothing could be scraped.
    """
    links_field = row["Links"]
    # An empty CSV cell is read by pandas as NaN (a float), which has no
    # .split(); treat any non-string value as "no links".
    if not isinstance(links_field, str):
        return ""

    scraped_lines = []
    for link in links_field.split('\n'):
        link = link.strip()
        # Blank segments (empty field, trailing newline) would only start a
        # browser session that is guaranteed to fail.
        if not link:
            continue

        print(f"Scraping data from link: {link}")
        scraped_data = scrape_uniqlo(link)
        if scraped_data:
            scraped_lines.append(str(scraped_data) + "\n")
        # Pause between page loads
        time.sleep(3)

    return "".join(scraped_lines)

In [66]:
# Load the reduced data set used for a trial run of the scraper.
df_test_2024_small = pd.read_csv(
    filepath_or_buffer="../../data/youtube_data/SMALL-uniqlo-data.csv",
)

In [68]:
# Run the scraper row by row; the combined text per row goes into a "ScrapedData" column.
df_test_2024_small["ScrapedData"] = df_test_2024_small.apply(
    func=scrape_and_update,
    axis="columns",
)

Scraping data from link: https://c.klarna.com/al/AGrw/
An error occurred: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="productMaterialDescription"]"}
  (Session info: chrome-headless-shell=124.0.6367.62); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001007aa8dc chromedriver + 4368604
1   chromedriver                        0x00000001007a2d70 chromedriver + 4337008
2   chromedriver                        0x00000001003c6c04 chromedriver + 289796
3   chromedriver                        0x0000000100408e00 chromedriver + 560640
4   chromedriver                        0x00000001004415ec chromedriver + 792044
5   chromedriver                        0x00000001003fdab4 chromedriver + 514740
6   chromedriver                        0x00000001003fe50c chromedriver + 517388
7   chromedriver       

In [70]:
# Save the trial-run results without writing the row index.
df_test_2024_small.to_csv(
    path_or_buf='SMALL_uniqlo_materials.csv',
    index=False,
)

In [62]:
# Run the scraper over the full table; the combined text per row goes into "ScrapedData".
df["ScrapedData"] = df.apply(
    func=scrape_and_update,
    axis="columns",
)

Scraping data from link: https://www.uniqlo.com/us/en/products/E456215-000/00?colorDisplayCode=30&sizeDisplayCode=001
Scraping data from link: https://www.uniqlo.com/us/en/products/E455756-000/00?colorDisplayCode=00&sizeDisplayCode=003
Scraping data from link: https://www.uniqlo.com/us/en/products/E454767-000/00?colorDisplayCode=01&sizeDisplayCode=004
Scraping data from link: https://www.uniqlo.com/us/en/products/E455749-000/00?colorDisplayCode=71&sizeDisplayCode=002
Scraping data from link: https://www.uniqlo.com/us/en/products/E458619-000/00?colorDisplayCode=69&sizeDisplayCode=002
Scraping data from link: https://www.uniqlo.com/us/en/products/E459580-000/00?colorDisplayCode=01&sizeDisplayCode=999
Scraping data from link: https://t.me/onechanceTG
An error occurred: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="productMaterialDescription"]"}
  (Session info: chrome-headless-shell=124.0.6367.62); For documentation on this error, please visit:

In [71]:
# Save the full results without writing the row index.
df.to_csv(
    path_or_buf='uniqlo_videos_materials.csv',
    index=False,
)

In [None]:
# Manual smoke test: scrape each link of the first row and print whatever comes back.
top_row = df.iloc[0]
links_list = top_row["Links"].split('\n')

for link in links_list:
    print(f"Scraping data from Uniqlo link: {link}")
    scraped_data = scrape_uniqlo(link)
    if scraped_data:
        # Result followed by one blank line
        print(scraped_data, end="\n\n")
    # Pause between page loads
    time.sleep(3)