In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time

In [4]:
def scrape_abercrombie(url):
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        
        # Initialize the WebDriver
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            
            # Wait for the page to fully load
            WebDriverWait(driver, 25).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'body')))
            
            # Get the page source after interactions
            page_source = driver.page_source
        
        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        
        data_title = soup.find("h1", class_="product-title-component")
        data_element = soup.find("h4", class_="h4 fabric-care-mfe__label")
        
        if data_title and data_element:
            item = data_title.text.strip()
            materials = data_element.text.strip()
            return process_item_data(item, materials)
        else:
            return {"error": "Link no longer available or missing data."}
            
    except Exception as e:
        return {"error": f"An error occurred: {str(e)}"}

def process_item_data(item, materials):
    item_dict = {'item': item}
    
    # Process materials
    if 'Body:' in materials:
        components = materials.split('/')
        for component in components:
            component = component.strip()
            if component.startswith('Body:'):
                _, composition = component.split(':', 1)
                parts = composition.strip().split()
                
                for part in parts:
                    if '%' in part:
                        try:
                            percentage = int(part.split('%')[0])
                            material_index = parts.index(part) + 1
                            if material_index < len(parts):
                                material = parts[material_index].strip()
                                material = ''.join(char for char in material if char.isalnum() or char.isspace())
                                item_dict[material.lower()] = percentage
                        except ValueError:
                            continue
    else:
        item_dict['materials'] = "No body material info available."
        
    return item_dict

In [5]:
def scrape_and_update(row):
    links_list = row["Links"].split('\n')
    scraped_data_combined = ""

    for link in links_list:
        print(f"Scraping data from link: {link}")
        scraped_data = scrape_abercrombie(link)
        if scraped_data:
            scraped_data_combined += str(scraped_data) + "\n"
            print(scraped_data)
        time.sleep(3)
        print()

    return scraped_data_combined

In [9]:
df = pd.read_csv("../../data/youtube_data/abercrombie_and_fitch_youtube_data.csv")
df2 = pd.read_csv("../../data/youtube_data/abercrombie_youtube_data.csv")


In [7]:
df['ScrapedData'] = df.apply(scrape_and_update, axis=1)

Scraping data from link: https://rstyle.me/+StTDgwXXE8352sPDHVJg9Q
{'item': 'Cropped Squareneck Rib Tank', 'cotton': 95, 'elastane': 5}

Scraping data from link: https://rstyle.me/+Rt1Fs5m07f2aAIkO6Q9rzQ
{'item': 'Curve Love High Rise Dad Short', 'cotton': 99, 'elastane': 1}

Scraping data from link: https://rstyle.me/+7s4yhOPCfSU0ZvjTeNrnpg
{'item': 'Vintage Sunday Short', 'cotton': 70, 'polyester': 30}

Scraping data from link: https://rstyle.me/+Rl1q0Q3FUpF9-5c_cbaYTQ
{'item': 'Summer Destination Vintage Sunday Crew', 'cotton': 70, 'polyester': 30}

Scraping data from link: https://rstyle.me/+nVGtLLYo74cKU2I9FGQrgQ
{'item': 'Linen-Blend Pull-On Pant', 'linen': 55, 'viscose': 45}

Scraping data from link: https://rstyle.me/+ZO-gEOjJI19nU3azWPtxoA
{'item': 'Linen-Blend Set Top', 'linen': 55, 'viscose': 45}

Scraping data from link: https://rstyle.me/+mzkgNnxFNBN8BshcDtic4A
{'item': 'The A&F Scarlett Linen-Blend Wrap Mini Skort', 'linen': 55, 'viscose': 45}

Scraping data from link: ht

In [8]:
df.to_csv('af2_materials.csv', index=False)

In [10]:
df2.head()

Unnamed: 0,Title,Links,VideoLink
0,"ABERCROMBIE WINTER TRY-ON HAUL! (jeans, cardig...",https://rstyle.me/+13bKRVPz3MteevWjaKcLbA\nhtt...,https://www.youtube.com/watch?v=CVr4GYqfDkQ
1,HUGE ARITZIA &amp; ABERCROMBIE TRY ON HAUL - M...,https://rstyle.me/+T592YVozGButQKfTnyAtDA\nhtt...,https://www.youtube.com/watch?v=BLyA-WAZeto
2,"Spring Fashion Try On | Uniqlo, Cos, H&amp;M, ...",https://rstyle.me/+IgC0RdDQdNOikBQRFXy1gQ\nhtt...,https://www.youtube.com/watch?v=uG3VPnTxr7A
3,"Abercrombie &amp; Fitch Away REVIEW - Fresh, C...",https://bit.ly/3nWpNON\nhttps://michaelmalul.c...,https://www.youtube.com/watch?v=RZCPaVA1e28
4,$2000 ABERCROMBIE TRY ON HAUL | HOLIDAY EDITION,https://rstyle.me/+PMmSYeTx1TKEZfzjfaOdoQ\nhtt...,https://www.youtube.com/watch?v=_NmgMT-0Pp0


In [11]:
df2['ScrapedData'] = df2.apply(scrape_and_update, axis=1)

Scraping data from link: https://rstyle.me/+13bKRVPz3MteevWjaKcLbA
{'error': 'Link no longer available or missing data.'}

Scraping data from link: https://rstyle.me/+xPiFXbwg25YOdIWJ3ki81g
{'item': 'Elevated Trench Coat', 'materials': 'No body material info available.'}

Scraping data from link: https://rstyle.me/+ooiA7svRXUtS2Cq6tKa0ww
{'error': 'Link no longer available or missing data.'}

Scraping data from link: https://rstyle.me/+1LjCVCtZvRU-z0Qx4V5e0A
{'item': 'Cable Short Cardigan', 'cotton': 60, 'viscose': 20}

Scraping data from link: https://rstyle.me/+iGupvCqMitSgSiMjkeLs4A
{'error': 'Link no longer available or missing data.'}

Scraping data from link: https://rstyle.me/+e8eLSf0i-tqn9kcQCtLKrQ
{'error': 'Link no longer available or missing data.'}

Scraping data from link: https://rstyle.me/+XVfRldLMT5fGOMJZCM1xIQ
{'item': 'Soft Matte Seamless Long-Sleeve Crew Bodysuit', 'nylon': 79, 'elastane': 21}

Scraping data from link: https://rstyle.me/+V8Mk2gmkzz-ZPS7lbUBw4g
{'item

In [12]:
df2.to_csv('af2_materials.csv', index=False)