# This was a workspace, f21_ver2.ipynb works the best

In [30]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re

In [6]:
# USES SELENIUM
def scrape_f21(url, target_element_xpath, wait_time=10):
    try:
        chrome_options = Options()

        driver = webdriver.Chrome(options=chrome_options)
        
        driver.get(url)

        product_name_element = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.CLASS_NAME, "pdp__name"))
        )
        product_name = product_name_element.text

        target_element = WebDriverWait(driver, wait_time).until(
            EC.visibility_of_element_located((By.XPATH, target_element_xpath))
        )

        dynamic_content = target_element.text
        
        return product_name, dynamic_content
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None
        
    finally:
        # Close the WebDriver
        driver.quit()

url = "https://www.forever21.com/us/2001274878.html"
target_element_xpath = '//*[@id="main"]/div[2]/div[1]/div[2]/div[3]/div/div[6]/div/div[1]/section[2]/div'

dynamic_content = scrape_f21(url, target_element_xpath)
if dynamic_content:
    print(dynamic_content)

('Bubble-Hem Babydoll Mini Dress', '- Shell: 77% rayon, 23% nylon\n- Lining: 100% polyester\n- Hand wash cold')


Below uses beautiful soup and requests!

In [43]:
def scrape_f21(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status() 

        soup = BeautifulSoup(response.text, 'html.parser')

        product_name = soup.find('h1', class_='pdp__name')
        product_name = product_name.get_text(strip=True) if product_name else "Product name not found"

        content = soup.find('h3', string='Content + Care').find_next('div', class_='d_content').get_text(strip=True)


        return product_name, content

    except requests.HTTPError as e:
        print(f"HTTP error occurred: {str(e)}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return None

urls = [
    "https://www.forever21.com/us/2001274878.html",
    "https://www.forever21.com/us/20012736030102.html",
    "https://www.forever21.com/us/2001274138.html",
    "https://www.forever21.com/us/20004974310102.html",
    "https://www.forever21.com/us/20004709370602.html?source=shoppingads&glCountry=US&glCurrency=USD&utm_source=google&utm_medium=cpc&utm_campaign=FPMX_StandardShopping_BrandTEST&utm_id=71700000118295495&gad_source=1&gclid=Cj0KCQjw0MexBhD3ARIsAEI3WHKdcjYy1wEfkj2sr7imRFNivlIrBz1F4zPtTliRnHGlzHzy-3CWRDMaAmhpEALw_wcB&gclsrc=aw.ds",
    "https://www.forever21.com/us/20012730370205.html"
]

for url in urls:
    print("scraping link", url)
    product_info = scrape_f21(url)
    if product_info:
        print(product_info)
    


scraping link https://www.forever21.com/us/2001274878.html
('Bubble-Hem Babydoll Mini Dress', '- Shell: 77% rayon, 23% nylon- Lining: 100% polyester- Hand wash cold')
scraping link https://www.forever21.com/us/20012736030102.html
HTTP error occurred: 403 Client Error: Forbidden for url: https://www.forever21.com/us/20012736030102.html
scraping link https://www.forever21.com/us/2001274138.html
('Mesh Floral Print Midi Skirt', '- Shell: 94% polyester, 6% spandex- Lining: 96% polyester, 4% spandex- Hand wash cold')
scraping link https://www.forever21.com/us/20004974310102.html
('Mesh Draped-Sleeve Mini Dress', '- Shell, Lining, & Other contents: 100% polyester- Hand wash cold')
scraping link https://www.forever21.com/us/20004709370602.html?source=shoppingads&glCountry=US&glCurrency=USD&utm_source=google&utm_medium=cpc&utm_campaign=FPMX_StandardShopping_BrandTEST&utm_id=71700000118295495&gad_source=1&gclid=Cj0KCQjw0MexBhD3ARIsAEI3WHKdcjYy1wEfkj2sr7imRFNivlIrBz1F4zPtTliRnHGlzHzy-3CWRDMaAmhp

In [41]:
import requests
from bs4 import BeautifulSoup
from time import sleep
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
]

def scrape_f21(url, max_retries=3):
    session = requests.Session()  
    retries = 0

    while retries < max_retries:
        try:
            headers = {'User-Agent': random.choice(USER_AGENTS)}
            response = session.get(url, headers=headers)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            product_name = soup.find('h1', class_='pdp__name')
            product_name = product_name.get_text(strip=True) if product_name else "Product name not found"
            content = soup.find('h3', string='Content + Care').find_next('div', class_='d_content').get_text(strip=True)
            
            product = (product_name, content)

            return format_product_info(product)

        except requests.HTTPError as e:
            if e.response.status_code == 403:
                print(f"Attempt {retries+1}: HTTP error 403 encountered. Retrying...")
                retries += 1
                sleep(5) 
            else:
                print(f"HTTP error occurred: {str(e)}")
                break
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            break
    return {'error': "error occurred"}

urls = [
    "https://www.forever21.com/us/2001274878.html",
    "https://www.forever21.com/us/20012736030102.html",
    "https://www.forever21.com/us/2001274138.html",
    "https://www.forever21.com/us/20004974310102.html",
    "https://www.forever21.com/us/20004709370602.html?source=shoppingads&glCountry=US&glCurrency=USD&utm_source=google&utm_medium=cpc&utm_campaign=FPMX_StandardShopping_BrandTEST&utm_id=71700000118295495&gad_source=1&gclid=Cj0KCQjw0MexBhD3ARIsAEI3WHKdcjYy1wEfkj2sr7imRFNivlIrBz1F4zPtTliRnHGlzHzy-3CWRDMaAmhpEALw_wcB&gclsrc=aw.ds",
    "https://www.forever21.com/us/20012730370205.html"
]

scraped_data = []

for url in urls:
    # print("Scraping link", url)
    product_info = scrape_f21(url)
    if product_info:
        print(product_info)
        scraped_data.append(product_info)


An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}
Attempt 1: HTTP error 403 encountered. Retrying...
An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}
An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}
An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}
An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}
Attempt 1: HTTP error 403 encountered. Retrying...
Attempt 2: HTTP error 403 encountered. Retrying...
An error occurred: too many values to unpack (expected 2)
{'error': 'error occurred'}


In [32]:
scraped_data

[]

In [40]:
def parse_materials(content):
    print(content)
    material_dict = {}

    pattern = r'(\d+)%\s*([a-zA-Z]+)'

    # Prioritize "Shell 1" over "Shell 2", and "Shell" over "Lining"
    if "Shell 1:" in content:
        # Extract the content specifically for "Shell 1"
        content = content.split("Shell 1:")[1].split("-")[0]
    elif "Shell:" in content:
        # Split the content and avoid considering "Lining" or "Other contents" if "Shell" is present
        content = content.split("Shell:")[1].split("-")[0]

    matches = re.findall(pattern, content)
    for match in matches:
        percentage, material = match
        material_dict[material.lower()] = int(percentage)

    return material_dict

def format_product_info(products):
    formatted_products = []

    for name, content in products:
        
        material_info = parse_materials(content)
        product_dict = {'item': name}
        product_dict.update(material_info)
        formatted_products.append(product_dict)

    return formatted_products


product_info = [
    ('Bubble-Hem Babydoll Mini Dress', '- Shell: 77% rayon, 23% nylon- Lining: 100% polyester- Hand wash cold'),
    ('Tiered Drawstring Maxi Skirt', '- 100% polyester- Hand wash cold'),
    ('Mesh Floral Print Midi Skirt', '- Shell: 94% polyester, 6% spandex- Lining: 96% polyester, 4% spandex- Hand wash cold'),
    ('Mesh Draped-Sleeve Mini Dress', '- Shell, Lining, & Other contents: 100% polyester- Hand wash cold'),
    ('Mesh Slip Mini Dress', '- Shell 1: 95% polyester, 5% spandex- Shell 2: 90% nylon, 10% spandex- Lining: 100% polyester- Hand wash cold'),
    ('Lace-Trim Tube Mini Dress', '- 97% polyester, 3% spandex- Hand wash cold')
]

formatted_products = format_product_info(product_info)
for product in formatted_products:
    print(product)


- Shell: 77% rayon, 23% nylon- Lining: 100% polyester- Hand wash cold
- 100% polyester- Hand wash cold
- Shell: 94% polyester, 6% spandex- Lining: 96% polyester, 4% spandex- Hand wash cold
- Shell, Lining, & Other contents: 100% polyester- Hand wash cold
- Shell 1: 95% polyester, 5% spandex- Shell 2: 90% nylon, 10% spandex- Lining: 100% polyester- Hand wash cold
- 97% polyester, 3% spandex- Hand wash cold
{'item': 'Bubble-Hem Babydoll Mini Dress', 'rayon': 77, 'nylon': 23}
{'item': 'Tiered Drawstring Maxi Skirt', 'polyester': 100}
{'item': 'Mesh Floral Print Midi Skirt', 'polyester': 94, 'spandex': 6}
{'item': 'Mesh Draped-Sleeve Mini Dress', 'polyester': 100}
{'item': 'Mesh Slip Mini Dress', 'polyester': 95, 'spandex': 5}
{'item': 'Lace-Trim Tube Mini Dress', 'polyester': 97, 'spandex': 3}


In [42]:
import re

def format_product_info(products):
    formatted_products = []

    pattern = r'(\d+)%\s*([a-zA-Z]+)'
    
    for name, content in products:
        print(content)  
        material_dict = {}

        # Prioritize "Shell 1" over "Shell 2", and "Shell" over "Lining"
        if "Shell 1:" in content:
            # Extract the content specifically for "Shell 1"
            content = content.split("Shell 1:")[1].split("-")[0]
        elif "Shell:" in content:
            # Split the content and avoid considering "Lining" or "Other contents" if "Shell" is present
            content = content.split("Shell:")[1].split("-")[0]

        matches = re.findall(pattern, content)
        for match in matches:
            percentage, material = match
            material_dict[material.lower()] = int(percentage)

        product_dict = {'item': name}
        product_dict.update(material_dict)
        formatted_products.append(product_dict)

    return formatted_products

product_info = [
    ('Bubble-Hem Babydoll Mini Dress', '- Shell: 77% rayon, 23% nylon- Lining: 100% polyester- Hand wash cold'),
    ('Tiered Drawstring Maxi Skirt', '- 100% polyester- Hand wash cold'),
    ('Mesh Floral Print Midi Skirt', '- Shell: 94% polyester, 6% spandex- Lining: 96% polyester, 4% spandex- Hand wash cold'),
    ('Mesh Draped-Sleeve Mini Dress', '- Shell, Lining, & Other contents: 100% polyester- Hand wash cold'),
    ('Mesh Slip Mini Dress', '- Shell 1: 95% polyester, 5% spandex- Shell 2: 90% nylon, 10% spandex- Lining: 100% polyester- Hand wash cold'),
    ('Lace-Trim Tube Mini Dress', '- 97% polyester, 3% spandex- Hand wash cold')
]

formatted_products = format_product_info(product_info)
for product in formatted_products:
    print(product)


- Shell: 77% rayon, 23% nylon- Lining: 100% polyester- Hand wash cold
- 100% polyester- Hand wash cold
- Shell: 94% polyester, 6% spandex- Lining: 96% polyester, 4% spandex- Hand wash cold
- Shell, Lining, & Other contents: 100% polyester- Hand wash cold
- Shell 1: 95% polyester, 5% spandex- Shell 2: 90% nylon, 10% spandex- Lining: 100% polyester- Hand wash cold
- 97% polyester, 3% spandex- Hand wash cold
{'item': 'Bubble-Hem Babydoll Mini Dress', 'rayon': 77, 'nylon': 23}
{'item': 'Tiered Drawstring Maxi Skirt', 'polyester': 100}
{'item': 'Mesh Floral Print Midi Skirt', 'polyester': 94, 'spandex': 6}
{'item': 'Mesh Draped-Sleeve Mini Dress', 'polyester': 100}
{'item': 'Mesh Slip Mini Dress', 'polyester': 95, 'spandex': 5}
{'item': 'Lace-Trim Tube Mini Dress', 'polyester': 97, 'spandex': 3}
