In [9]:
# imports
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from urllib3.util.retry import Retry

In [10]:
# Load the YouTube haul dataset; each row carries a "Links" column holding the
# product URLs that the scraping cells below iterate over.
df = pd.read_csv(filepath_or_buffer='../../data/youtube_data/asos_youtube_data.csv')

# Preview the first two rows to confirm the columns loaded as expected.
df.head(n=2)

Unnamed: 0,Title,Links,VideoLink
0,**NEW IN** ASOS SPRING 2024 HAUL + TRY ON | I ...,https://go.shopmy.us/p-4051450,https://www.youtube.com/watch?v=7bgLJhXuLJE
1,"Huge EASTER Spring Dress TRY ON HAUL *ASOS, AW...",https://www.awbridal.com/aw-coralia-dress-lf22...,https://www.youtube.com/watch?v=xbd-nv6uEI4


In [15]:
def _parse_fabric_composition(product_info):
    """Convert the "About Me" text block into a ``{fabric: percent}`` dict.

    The first line containing ``%`` is taken as the composition line, e.g.
    ``"Main: 76% Polyester, 18% Viscose, 6% Elastane."``.  Fabric names are
    lower-cased; percentages are kept as strings exactly as they appear.

    Raises:
        ValueError: no line of ``product_info`` contains a ``%``.
        IndexError: the composition line does not follow the expected
            ``"<n>% <fabric>, ..."`` layout.
    """
    percent_lines = [line for line in product_info.split('\n') if '%' in line]
    if not percent_lines:
        raise ValueError('no fabric composition line found')
    composition = percent_lines[0].rstrip('.')

    # Drop the leading section label.  With several labelled sections, prefer
    # "Main" and fall back to "Body".
    # NOTE(review): when another labelled section follows the chosen one on the
    # same line, its text is not stripped here - confirm against real pages.
    label_count = composition.count(':')
    if label_count > 1:
        marker = 'Main: ' if 'Main' in composition else 'Body: '
        composition = composition.split(marker)[1]
    elif label_count == 1:
        composition = composition.split(': ')[1]

    materials = {}
    for component in composition.split(', '):
        parts = component.split('% ')
        materials[parts[1].lower()] = parts[0]
    return materials


def scrape_asos(url, wait_seconds=10, timeout=10):
    """Scrape the product title and fabric composition from an ASOS product page.

    A fresh Chrome WebDriver is started for every call and is always shut
    down before returning.

    Args:
        url: Product page URL to open.
        wait_seconds: Fixed pause (seconds) after loading the page, before the
            description section is clicked.
        timeout: Maximum time (seconds) to wait for the "About Me" content to
            become visible after the click.

    Returns:
        dict: one of
            * ``{'item': <title>, <fabric>: <percent>, ...}`` on success
              (percentages are strings, fabric names lower-case);
            * ``{'item': 'Unsupported URL', 'url': url}`` when the URL the
              browser ends up on does not contain ``'asos'``;
            * ``{'item': 'No Data', 'url': url}`` when anything fails,
              including failure to start the browser.
    """
    # Bound before the try so the finally clause is safe even when the browser
    # never starts; otherwise driver.quit() would raise on an unbound name and
    # replace the "No Data" result with an exception.
    driver = None
    try:
        chrome_options = Options()
        driver = webdriver.Chrome(options=chrome_options)

        driver.get(url)

        # Checked after navigation, so the URL the browser finally landed on
        # (not the one passed in) decides whether the page is supported.
        if 'asos' not in driver.current_url:
            return {'item': 'Unsupported URL', 'url': url}

        # Fixed pause before interacting with the page.
        time.sleep(wait_seconds)

        # Expand the description section that holds the fabric details.
        # NOTE(review): assumes the 5th list item is the "About Me" toggle - confirm.
        toggle_xpath = '//*[@id="productDescription"]/ul/li[5]/div/h2'
        driver.find_element(By.XPATH, toggle_xpath).click()

        about_me_xpath = '//*[@id="productDescriptionAboutMe"]'
        about_me = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.XPATH, about_me_xpath))
        )
        product_info = about_me.text

        # NOTE(review): 'jcdpl' looks like a generated class name for the title
        # element and may change between site releases - verify.
        title = driver.find_element(By.CLASS_NAME, 'jcdpl').text

        final_data = {'item': title}
        final_data.update(_parse_fabric_composition(product_info))
        return final_data

    except Exception:
        # Best-effort scraper: any failure becomes a "No Data" record.
        # Exception (rather than a bare except) keeps Ctrl-C / kernel
        # interrupt working during long batch runs.
        return {'item': 'No Data', 'url': url}

    finally:
        if driver is not None:
            driver.quit()

In [16]:
# Example usage:
url = "https://www.asos.com/us/asos-design/asos-design-pleated-mini-skirt-in-gray/prd/205661997#colourWayId-205661998"

material_info = scrape_asos(url)
print(material_info)

{'item': 'ASOS DESIGN pleated mini skirt in gray', 'polyester': '76', 'viscose': '18', 'elastane': '6'}


In [17]:
def scrape_and_update(row):
    """Scrape every link listed in a row's ``"Links"`` cell.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose ``"Links"`` entry
            holds newline-separated URLs.

    Returns:
        str: the ``str()`` of each ``scrape_asos`` result, one per line, each
        terminated by ``"\n"``.  An empty string when the cell is missing or
        contains no links.
    """
    links = row["Links"]

    # A missing cell is not a string (pandas reads it as NaN), so calling
    # .split() on it would raise and abort the whole DataFrame.apply run.
    if not isinstance(links, str):
        return ""

    scraped_lines = []
    for link in links.split('\n'):
        link = link.strip()
        # Skip blank entries: each scrape launches a browser, and an empty
        # URL can only ever fail.
        if not link:
            continue
        scraped_lines.append(f"{scrape_asos(link)}\n")

    return "".join(scraped_lines)

In [None]:
# Scrape the links of every row (slow: one browser session per link) and keep
# the combined text result alongside the original columns.
scraped_column = df.apply(scrape_and_update, axis='columns')
df['ScrapedData'] = scraped_column

In [19]:
# Persist the scraped results.  The output folder is created first because
# to_csv raises OSError when the parent directory does not exist, which would
# throw away the slow scraping run above.
output_path = Path('materials_data/asos_materials.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)