In [1]:
# imports
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from urllib3.util.retry import Retry

In [37]:
# Load the Mango YouTube dataset. The path is relative to the notebook's
# working directory, so the kernel must be started from this folder.
df = pd.read_csv('../../data/youtube_data/mango_youtube_data.csv')

# Preview the first two rows; keep this as the cell's last expression so the
# notebook renders it as a table.
df.head(2)

Unnamed: 0,Title,Links,VideoLink
0,SPRING HAUL &amp; TRY ON | inc. MANGO SEZANE &...,https://rstyle.me/+2o8zucCxq7qCSZZUXYbA3g\nhtt...,https://www.youtube.com/watch?v=mg9D_L0hmQI


In [44]:
def _parse_mango_composition(description):
    """Convert a composition sentence into a ``{fabric: percent}`` dict.

    The text is expected to be shaped like
    ``"<label>: 51% viscose,26% linen,23% cotton. <more text>"``: everything
    before the first ``.`` is kept, the part after the first ``": "`` is split
    on commas, and each piece is split on ``"% "``.

    Args:
        description: Raw text of the composition paragraph.

    Returns:
        dict: Lower-cased, stripped fabric name -> percentage as a string.

    Raises:
        IndexError: If the text does not contain the expected separators.
    """
    composition = description.split('.')[0].split(': ')[1]

    materials = {}
    for material_comp in composition.split(','):
        material_percentage = material_comp.split('% ')
        # Strip so a space after the comma ("51% viscose, 26% linen") does not
        # leak into the stored percentage.
        percent = material_percentage[0].strip()
        fabric = material_percentage[1].lower().strip()
        materials[fabric] = percent
    return materials


def scrape_mango(url, wait_seconds=5):
    """Scrape the product name and fabric composition from a Mango product page.

    Opens ``url`` in headless Chrome, waits for the page to render, then reads
    the product name via Selenium and the composition paragraph via an lxml
    XPath query on the rendered page source.

    Args:
        url: Address to open in the browser.
        wait_seconds: Seconds to sleep after navigation before the page is
            read. Defaults to 5, the value that was previously hard-coded.

    Returns:
        dict: One of
            * ``{'item': <product name>, <fabric>: <percent>, ...}`` on
              success, with percentages kept as strings;
            * ``{'item': 'Unsupported URL', 'url': url}`` when the address the
              browser ends up on does not contain ``'mango'``;
            * ``{'item': 'No Data', 'url': url}`` when any step fails.

    The function is best-effort: it never raises, so one bad link cannot abort
    a batch run. The browser is always shut down before returning.
    """
    driver = None
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # Initialize the WebDriver and open the webpage
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # Checked against the browser's current address (after navigation),
        # not against the ``url`` argument.
        if 'mango' not in driver.current_url:
            return {'item': 'Unsupported URL', 'url': url}

        # Give the page time to finish rendering before reading it
        time.sleep(wait_seconds)

        page_source = driver.page_source
        title = driver.find_element(By.CLASS_NAME, 'product-name').text

        # Parse the rendered page with lxml and pull the first paragraph of the
        # product-details block. The XPath is positional, so it will stop
        # matching if the page layout changes.
        tree = html.fromstring(page_source)
        data_element = tree.xpath('//*[@id="productDesktop"]/main/div/div[3]/div[1]/div[2]/div/p[1]/text()')[0]

        final_data = {'item': title}
        final_data.update(_parse_mango_composition(data_element))
        return final_data

    except Exception:
        # Deliberate catch-all: any failure (browser start-up, navigation,
        # missing element, unexpected text format) is reported as "No Data".
        return {'item': 'No Data', 'url': url}

    finally:
        # Always release the browser, including on the early "Unsupported URL"
        # return and on errors; otherwise every failed link leaves a Chrome
        # process behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # Shutdown problems must not break the never-raises contract.
                pass

In [36]:
# Single Mango product page used as a manual smoke test for the scraper
url = 'https://shop.mango.com/us/women/blazers-suit-jackets/vest-with-tie-closure_67067136.html?c=40'

# Scrape that one page and print the resulting dict: 'item' holds the product
# name and each remaining key is a fabric mapped to its percentage string
data = scrape_mango(url)
print(data)

{'item': 'Vest with tie closure', 'viscose': '51', 'linen': '26', 'cotton': '23'}


In [38]:
def scrape_and_update(row):
    """Scrape every link listed in a row and return the results as one string.

    Args:
        row: Mapping-like row (e.g. a pandas Series from
            ``df.apply(..., axis=1)``) whose ``"Links"`` entry holds
            newline-separated URLs.

    Returns:
        str: The ``str()`` of each ``scrape_mango`` result, one per line, each
        followed by ``"\n"``. An empty string is returned when ``"Links"`` is
        not a string, which is how pandas represents an empty CSV field (NaN).
    """
    links = row["Links"]

    # A missing Links cell is read from CSV as float NaN; calling .split on it
    # would raise AttributeError and abort the whole DataFrame.apply run.
    if not isinstance(links, str):
        return ""

    return "".join(str(scrape_mango(link)) + "\n" for link in links.split('\n'))

In [45]:
# Scrape every link of every row and store the combined result text in a new
# 'ScrapedData' column. This adds the column to `df` in place, and each link
# starts its own headless Chrome session, so the cell is slow on large tables.
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

In [46]:
# Save the table, including the scraped results, without the index column.
output_path = Path('materials_data/mango_materials.csv')

# DataFrame.to_csv does not create missing parent folders; create the output
# directory first so a long scraping run is not lost to an OSError at the end.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)