In [1]:
# imports
import re
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import html
from requests.adapters import HTTPAdapter
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from urllib3.util.retry import Retry

In [2]:
# Load the YouTube video table; per the preview below it carries the
# Title, Links and VideoLink columns.
youtube_csv_path = '../../data/youtube_data/patagonia_youtube_data.csv'
df = pd.read_csv(youtube_csv_path)

# Peek at the first two rows to confirm the load worked.
df.head(2)

Unnamed: 0,Title,Links,VideoLink
0,Patagonia Guidewater Backpack Review,https://bit.ly/4aIg9mh\nhttps://bit.ly/40RxmFb...,https://www.youtube.com/watch?v=1r-sdnACmsA
1,[박영준TV] [Review] Patagonia Capilene Cool Daily...,https://cafe.naver.com/windstopper,https://www.youtube.com/watch?v=jNL9GO2yCI0


In [57]:
# Fabric names the scraper recognises in the product's materials text.
_MATERIAL_NAMES = (
    'cotton', 'recycled cotton', 'organic cotton', 'polyester', 'recycled polyester', 'nylon',
    'recycled nylon', 'acrylic', 'spandex', 'flax', 'linen', 'hemp', 'cupro', 'lyocell', 'tencel',
    'refibra', 'modal', 'tencel modal', 'viscose', 'bamboo', 'lenzing viscose', 'ecovero', 'silk',
    'alpaca', 'wool', 'recycled wool', 'cashmere', 'recycled cashmere',
)


def _parse_material_percentages(text):
    """Return ``{fabric: percent}`` parsed from ``text``, or None if the split is not 100%."""
    # Longest names first: regex alternation takes the first alternative that
    # matches, so 'tencel' would otherwise win over 'tencel modal'.
    alternatives = '|'.join(
        re.escape(name) for name in sorted(_MATERIAL_NAMES, key=len, reverse=True)
    )
    matches = re.findall(rf'(\d+)% ({alternatives})', text, flags=re.IGNORECASE)

    # Only accept a composition whose percentages add up to exactly 100
    # (an empty match list sums to 0 and is rejected as well).
    if sum(int(percent) for percent, _ in matches) != 100:
        return None

    # Percentages are kept as strings, e.g. {'organic cotton': '60'}.
    return {fabric.lower(): percent for percent, fabric in matches}


def scrape_patagonia(url, wait_seconds=5):
    """Scrape the product title and fabric composition from a Patagonia product page.

    The page is loaded in headless Chrome, the title is read from the
    ``h1.h5.pdp-intro__title`` element, and the composition is parsed from the
    first line containing ``%`` inside the ``collapsible-2`` div (the element
    this scraper treats as the materials section).

    Parameters
    ----------
    url : str
        Link to open. It may redirect; the final URL must contain
        ``patagonia.com`` to be scraped.
    wait_seconds : int or float, default 5
        Seconds to sleep after navigation so the page can finish rendering.

    Returns
    -------
    dict
        ``{'item': <title>, <fabric>: <percent str>, ...}`` on success;
        ``{'item': 'Unsupported URL', 'url': url}`` when the final URL is not on
        patagonia.com; ``{'item': 'No Data', 'url': url}`` when the page cannot
        be loaded or parsed, or the percentages do not sum to 100.
    """
    no_data = {'item': 'No Data', 'url': url}
    driver = None

    try:
        # Set Chrome options for headless mode and start the browser
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(options=chrome_options)

        driver.get(url)

        # Check the post-redirect URL so shortened links are judged by their target
        if 'patagonia.com' not in driver.current_url:
            return {'item': 'Unsupported URL', 'url': url}

        # Give the page time to finish loading before reading its source
        time.sleep(wait_seconds)
        soup = BeautifulSoup(driver.page_source, "html.parser")

        title_element = soup.find('h1', class_='h5 pdp-intro__title')
        materials_element = soup.find('div', id='collapsible-2')
        if title_element is None or materials_element is None:
            return no_data

        # The first line mentioning a percentage is taken as the composition
        text_lines = materials_element.get_text(separator='\n', strip=True).split('\n')
        percent_lines = [line for line in text_lines if '%' in line]
        if not percent_lines:
            return no_data

        materials = _parse_material_percentages(percent_lines[0])
        if materials is None:
            return no_data

        final_data = {'item': title_element.text.strip()}
        final_data.update(materials)
        return final_data

    except Exception:
        # Best-effort scraper: any failure becomes a 'No Data' record, while
        # KeyboardInterrupt/SystemExit still propagate so a long run can be stopped.
        return no_data

    finally:
        # Always shut the browser down, including on the early returns above,
        # so repeated calls do not pile up Chrome processes.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                # A failed shutdown should not discard an otherwise good result.
                pass

In [58]:
# Sample Patagonia product pages (tracking parameters included) used to
# spot-check scrape_patagonia before running it over the whole table
urls = [
    'https://www.patagonia.com/product/mens-classic-retro-x-fleece-jacket/195699845640.html?s_kwcid=17928&utm_source=google&utm_medium=cpc&utm_campaign=BB_Ecomm_Shopping_ALL_WBSP_SaleKWs&gad_source=1&gclid=CjwKCAjw5v2wBhBrEiwAXDDoJd8XQGUW_sof8fEfoDRQUdlBPIlrxyB6fyngvB0--73eUmTbzjhBZBoCa84QAvD_BwE',
    'https://www.patagonia.com/product/womens-mainstay-lightweight-top/42315.html?dwvar_42315_color=LTPG&cgid=womens-tops-short-sleeve',
    'https://www.patagonia.com/product/guidewater-submersible-waterproof-backpack-29-liters/49165.html?avad=286573_f3964eac5&netid=1&pubid=228673&utm_source=www.thewadinglist.com&utm_medium=affiliate&utm_campaign=Custom+Link&src=cl&src=avl'
]

for url in urls:
    # Scrape one product page and print the resulting record (a dict)
    data = scrape_patagonia(url)
    print(data)

{'item': "Men's Classic Retro-X® Fleece Jacket", 'polyester': '100'}
{'item': "Women's Mainstay Top", 'organic cotton': '60', 'recycled polyester': '40'}
{'item': 'Guidewater Backpack 29L', 'recycled nylon': '100'}


In [54]:
def scrape_and_update(row, scraper=None):
    """Scrape every link listed in a row's ``Links`` field and combine the results.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Links"`` (e.g. a DataFrame row). The value is
        expected to be a string with one link per line.
    scraper : callable, optional
        Function taking a link and returning a record. Defaults to
        ``scrape_patagonia``; pass a stub to exercise this function without a
        browser.

    Returns
    -------
    str
        ``str(record) + "\\n"`` for each non-blank link, concatenated in order.
        An empty string when ``Links`` is not a string (e.g. NaN for a video
        with no links) or contains no links.
    """
    if scraper is None:
        scraper = scrape_patagonia

    links = row["Links"]
    # Missing values read from CSV arrive as float NaN, which has no .split()
    if not isinstance(links, str):
        return ""

    # Drop surrounding whitespace and skip blank lines so no browser session is
    # spent on an empty URL
    cleaned_links = (link.strip() for link in links.split('\n'))
    return "".join(str(scraper(link)) + "\n" for link in cleaned_links if link)

In [59]:
# Run the scraper over every row (one browser session per link, so this cell
# is slow) and attach the combined records as a new column.
scraped_column = df.apply(scrape_and_update, axis=1)
df['ScrapedData'] = scraped_column

In [60]:
# Persist the table, including the ScrapedData column, without the row index.
output_path = Path('materials_data/patagonia_materials.csv')

# to_csv does not create missing directories; make sure the target folder
# exists so the scrape results are not lost to a write error.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)