In [2]:
# imports
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [3]:
# Load the scraped YouTube metadata (Title, Links, VideoLink columns)
df = pd.read_csv(filepath_or_buffer='../../data/youtube_data/primark_youtube_data.csv')

# Preview the first two rows as a sanity check
df.head(n=2)

Unnamed: 0,Title,Links,VideoLink
0,NEW IN PRIMARK &amp; TRY ON HAUL + RIVER ISLAN...,https://rstyle.me/+qmS93EYalkwzIXFVo0LSSQ\nhtt...,https://www.youtube.com/watch?v=6aW6Db_13Zs
1,"Come Shopping With Us 🛍️ | NEW IN H&amp;M, Pri...",https://www.shopLTK.com/explore/elliepearce,https://www.youtube.com/watch?v=x7-iuXE4F9s


In [30]:
# Fabric names recognised in a composition string. Everything is lower-case
# because the scraped text is lower-cased before matching.
_MATERIALS = (
    'cotton', 'recycled cotton', 'organic cotton', 'polyester', 'recycled polyester', 'nylon',
    'recycled nylon', 'acrylic', 'spandex', 'elastane', 'flax', 'linen', 'hemp', 'cupro', 'lyocell', 'tencel',
    'refibra', 'modal', 'tencel modal', 'viscose', 'bamboo', 'lenzing viscose', 'ecovero', 'silk',
    'alpaca', 'wool', 'recycled wool', 'cashmere', 'recycled cashmere',
)

# Regex alternation takes the first alternative that matches, so longer names
# must come first; otherwise "tencel modal" would be captured as just "tencel".
_FABRIC_PATTERN = re.compile(
    r'(\d+)% ('
    + '|'.join(re.escape(name) for name in sorted(_MATERIALS, key=len, reverse=True))
    + r')'
)


def _parse_fabric_composition(text):
    """Map each fabric found in ``text`` to its percentage string, or return None if they do not total 100."""
    matches = _FABRIC_PATTERN.findall(text.lower())
    if sum(int(percent) for percent, _fabric in matches) != 100:
        return None
    return {fabric: percent for percent, fabric in matches}


def scrape_primark(url):
    """Scrape the title and fabric composition from a Primark product page.

    The page is rendered in headless Chrome, the product title is read from
    the element tagged ``data-testautomation-id="product-title"`` and the
    composition is parsed from the page's body-text paragraphs.

    Args:
        url: Link to open. It may redirect; the final URL is what is checked.

    Returns:
        dict: one of
            * ``{'item': <title>, <fabric>: <percent as str>, ...}`` when the
              matched percentages total exactly 100;
            * ``{'item': 'Unsupported URL', 'url': url}`` when the final URL
              does not contain ``'primark'``;
            * ``{'item': 'No Data', 'url': url}`` when the page cannot be
              loaded or parsed, or the percentages do not total 100.

    The function never raises: it is applied to many links in bulk, so any
    failure is reported as a 'No Data' record instead.
    """
    driver = None
    try:
        # Run Chrome without a visible window
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)

        driver.get(url)

        # Check the post-redirect URL, since links may go through shorteners
        if 'primark' not in driver.current_url:
            return {'item': 'Unsupported URL', 'url': url}

        # Give client-side rendering time to finish before reading the DOM
        time.sleep(5)  # Adjust the wait time as needed
        page_source = driver.page_source

        soup = BeautifulSoup(page_source, "html.parser")
        title_element = soup.find(attrs={"data-testautomation-id": "product-title"})
        # A missing title element raises AttributeError here -> 'No Data' below
        title = title_element.text

        tree = html.fromstring(page_source)
        paragraphs = tree.xpath("//p[@class='MuiTypography-root MuiTypography-body1']/text()")

        materials = _parse_fabric_composition('\n'.join(paragraphs))
        if materials is None:
            # Incomplete or unrecognised composition: report it explicitly
            return {'item': 'No Data', 'url': url}

        final_data = {'item': title}
        final_data.update(materials)
        return final_data

    except Exception:
        # Deliberate best-effort: one bad link must not abort a bulk run
        return {'item': 'No Data', 'url': url}

    finally:
        # Always release the browser, including on the early return and on
        # errors, so repeated calls do not pile up Chrome processes
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass  # nothing useful to do if the browser is already gone

In [17]:
# Sample Primark product pages used to spot-check scrape_primark
urls = [
    'https://www.primark.com/en-us/p/paula-echevarria-long-sleeve-crochet-top-cream-991100728116',
    'https://www.primark.com/en-us/p/angel-sleeve-midi-dress-blue-991102623505',
    'https://www.primark.com/en-us/p/lobster-print-swim-shorts-red-991090592352'
]

for url in urls:
    # Scrape one product page and print the resulting record
    data = scrape_primark(url)
    print(data)

{'item': 'Paula Echevarría Long Sleeve Crochet Top', 'cotton': '91', 'nylon': '9'}
{'item': 'Angel Sleeve Midi Dress', 'viscose': '86', 'nylon': '14'}
{'item': 'Lobster Print Swim Shorts', 'polyester': '100'}


In [32]:
def scrape_and_update(row):
    """Scrape every link listed in a row's ``Links`` field.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) whose ``"Links"`` entry
            is a newline-separated string of URLs.

    Returns:
        str: ``str()`` of each ``scrape_primark`` result, one per line, each
        terminated by ``"\\n"``. An empty string is returned when the row has
        no links.
    """
    raw_links = row["Links"]

    # An empty CSV cell is loaded by pandas as NaN (a float), which has no
    # .split(); treat any non-string value as "no links"
    if not isinstance(raw_links, str):
        return ""

    # Drop surrounding whitespace (e.g. a trailing '\r') and skip blank lines
    # so a headless browser is not launched for an empty URL
    links = (link.strip() for link in raw_links.split('\n'))
    return "".join(str(scrape_primark(link)) + "\n" for link in links if link)

In [33]:
# Pass each row to scrape_and_update and store the combined text per row
df['ScrapedData'] = df.apply(func=scrape_and_update, axis='columns')

In [None]:
# Persist the enriched table; the row index is not written
df.to_csv(path_or_buf='materials_data/primark_materials.csv', index=False)