# Write function for scraping all brand websites

## Problem "Buckets" for Brand Sites
1. Materials break down has multiple parts with different percentages (bag vs. body)
2. Needs to access content from page source
2. Needs to wait for content to be visible 
3. Never finishes running

In [1]:
# imports
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [2]:
# create session
session = requests.Session()
# retry five times in case of exception
retry = Retry(connect=5, backoff_factor=0.5)
# apply delays between attempts
adapter = HTTPAdapter(max_retries=retry)
session.mount('https://', adapter)

In [3]:
# materials list based off materials calculator
materials = ['cotton', 'recycled cotton', 'organic cotton', 'polyester', 'recycled polyester', 'nylon', 'recycled nylon', 'acrylic',
             'spandex', 'flax', 'linen', 'hemp', 'cupro', 'lyocell', 'tencel', 'refibra', 'modal', 'tencel modal', 'viscose',
             'bamboo', 'lenzing viscose', 'ecovero', 'silk', 'alpaca', 'wool', 'recycled wool', 'cashmere', 'recycled Cashmere']

In [4]:
# function to scrape only materials in the materials list
def scrape_materials(url):
    try:
        # search for brand review
        review = session.get(url)
            
        # content
        content = BeautifulSoup(review.text, 'html.parser')

        # extract materials 
        norm_text = str(content).lower()

        made_of = {}
        for material in materials:
            # search for percentage plus material
            result = re.search('[\d.]+%\s+' + material, norm_text)
            # if result doesn't exist, check material plus any non-space characters
            if not result:
                result = re.search('[\d.]+%\s+' + material + '\S', norm_text)
            
            # if result exists, add to made_of list
            if result:
                item = re.split('%\s+', result.group(0))
                if '.' in item[0]:
                    percent = float(item[0]) 
                else:
                    percent = int(item[0]) 
                made_of[item[1]] = percent

        return made_of

    except Exception as e:
        return e

In [5]:
# function to scrape only materials in the materials list from page source
def scrape_page_source(url):
    try:
        # set chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        
        # initialize the webdriver
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            
            # Get the page source after interactions
            page_source = driver.page_source
            
        # content
        content = BeautifulSoup(page_source, 'html.parser')

        # extract materials 
        norm_text = str(content).lower()

        made_of = {}
        for material in materials:
            # search for percentage plus material
            result = re.search('[\d.]+%\s+' + material, norm_text)
            # if result doesn't exist, check material plus any non-space characters
            if not result:
                result = re.search('[\d.]+%\s+' + material + '\S', norm_text)
            
            # if result exists, add to made_of list
            if result:
                item = re.split('%\s+', result.group(0))
                if '.' in item[0]:
                    percent = float(item[0]) 
                else:
                    percent = int(item[0]) 
                made_of[item[1]] = percent


        return made_of

    except Exception as e:
        return e

In [73]:
# princess polly test: success
print(scrape_materials("https://us.princesspolly.com/products/caruso-denim-wrap-skort-light-wash"))

{'cotton': 100}


In [74]:
# nike test: success
print(scrape_materials("https://www.nike.com/t/jordan-paris-saint-germain-pullover-hoodie-big-kids-hoodie-tpqdWh/45C566-023?nikemt=true&cp=75009274456_search_&gad_source=1&gclid=CjwKCAjwoPOwBhAeEiwAJuXRh47Ul92JN0-vlgv6RcIG0mKBklxgk4gs4zOPV8cguxRM7hK9eK_DThoCLUoQAvD_BwE"))

{'cotton': 80, 'polyester': 20}


In [75]:
# brandy test: success
print(scrape_materials("https://us.brandymelville.com/products/zelly-basic-top"))

{'cotton': 100}


In [76]:
# abercrombie test: failure - how do i scrape bag and body separately?
print(scrape_materials("https://bit.ly/3UlUXwr"))

{'cotton': 30, 'polyester': 70, 'lyocell': 22}


In [78]:
# amazon test: success - access materials from page source
print(scrape_page_source("https://www.amazon.com/Lyss%C3%A9-Womens-Schiffer-Button-Medium/dp/B01EWRKXTE/ref=lp_121173939011_1_2?pf_rd_p=53d84f87-8073-4df1-9740-1bf3fa798149&pf_rd_r=VN92AA0AZ4XSEQDKW4EJ"))

{'nylon': 75}


In [66]:
# asos test: failure - never finishes running
print(scrape_materials("https://www.asos.com/us/asos-design/asos-design-pleated-mini-skirt-in-gray/prd/205661997#colourWayId-205661998"))

In [19]:
# forever 21 test: failure - needs to wait for content to be visible
print(scrape_materials('https://www.forever21.com/us/20012622040501.html'))

<!doctype html>
<html lang="en-us"><head><title>just a moment...</title><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="ie=edge" http-equiv="x-ua-compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,blinkmacsystemfont,segoe ui,roboto,helvetica neue,arial,noto sans,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol,noto color emoji}@media (prefers-color-scheme:dark){body{background-color:#222;color:#d9d9d9}body a{color:#fff}body a:hover{color:#ee730a;text-decoration:underline}body .lds-ring div{border-color:#999 transparent transparent}body .font-red{color:#b20f03}body .big-button,body .pow-button{background-color:#4693ff;color:#1d1d1d}body #challenge-success-text{background-image:url(data:image/svg+xml;base

In [60]:
# american eagle test: failure - needs to wait for content to be visible 
print(scrape_page_source("https://www.ae.com/us/en/p/women/high-waisted-jeans/high-waisted-mom-jeans/ae-stretch-mom-jean/0436_4332_110?menu=cat4840004"))

<html lang="en"><head><script src="https://cdn.optimizely.com/js/24779750225.js" type="text/javascript"></script>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link as="script" href="https://www.ae.com/resources/b248977c5192a6d073ca760a2537f00c14d00114d3583" rel="preload"/><link as="script" href="/agwa-ugp-assets/vendor-c426a7473fbe2df0bca5babbed569334.js" integrity="sha256-/zjrqdwfde12oj1ycoizcbqtvwg3noyydelaewtzhee= sha512-ielyk7xkud1rlcv0vfm87zxb98rbbvmrfu+bfzkr+pbipcgtfcrpmiqgsts4mgzrwzjhvauw+sedyadndpcvgg==" rel="preload"/>
<link as="script" href="/agwa-ugp-assets/agwa-8fde92ee741bd06a2271b16c013edb9f.js" integrity="sha256-cuf6y0n7bjc1tcvbyiybg9qgup+k3cmljlxxk/uzko4= sha512-krxvaxii8+bcafjsekiuwv3eqhwzhalj6hqlm3ipgv1bbj9fg3qapxcqadrtlnasfx39mjjajxgj7win3tbs/g==" rel="preload"/>
<link as="font" crossorigin="" href="https://www.ae.com/assets/ae/fonts/f64c3634-1027-4aec-8bd4-d96f2444f559.woff2" rel="preload" type="font/woff2"/>
<link a

In [82]:
# alo test: success
print(scrape_materials("https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308&disableCurrencyEstimate&gad_source=1&gclid=CjwKCAjww_iwBhApEiwAuG6ccL77JcerRoX7_JAHcpKdcfgONVm-BMBBTVNxENk7-9cIIDnlReDxGxoCaegQAvD_BwE"))

{'nylon': 87}


In [83]:
# reformation test: success
print(scrape_materials("https://www.thereformation.com/products/vida-low-rise-pant/1314838BLK.html?dwvar_1314838BLK_color=BLK&_gl=1*13y89uh*_up*MQ..&gclid=CjwKCAjww_iwBhApEiwAuG6ccHuReQ50wHf1NGMlZG5XTJoqFuugi5BXX5mlzOcjB35u29kXoUvEwBoCbf4QAvD_BwE&atc_confirmation=true"))

{'polyester': 72, 'spandex': 3, 'viscose': 5, 'wool': 5}


In [84]:
# acne studios test: success
print(scrape_materials("https://www.acnestudios.com/us/en/wool-mohair-scarf---narrow-lavender-purple/CA0290-ADH.html?g=woman"))

{'nylon': 22, 'alpaca': 33, 'wool': 25}


In [85]:
# alice and olivia test: success
print(scrape_materials("https://www.aliceandolivia.com/rosalee-tie-strap-bustier-maxi-dress/CC404P22523G123.html?lang=default"))

{'cotton': 97}


In [86]:
# sandy liang test: success
print(scrape_materials("https://www.sandyliang.info/products/primo-dress"))

{'cotton': 35, 'polyester': 65}


In [87]:
# billabong test: success
print(scrape_materials("https://www.billabong.com/collections/womens-clothing-tops/products/faye-knit-1"))

{'cotton': 49, 'polyester': 46}


In [88]:
# boohoo test: success
print(scrape_materials("https://us.boohoo.com/lapel-crop-blazer/DZZ52779.html?color=123&_gl=1*m20h85*_up*MQ..*_ga*MjAyMzc4NTAyNi4xNzEzMzk2ODY5*_ga_CKX55DLD7G*MTcxMzM5Njg2OC4xLjEuMTcxMzM5Njk1My4wLjAuMA.."))

{'polyester': 95}


In [89]:
# nasty gal test: success
print(scrape_materials("https://www.nastygal.com/diamante-denim-wide-leg-jeans/BGG21619.html?_gl=1*dqvdhd*_up*MQ..*_ga*MTU0NzA2MjcuMTcxMzM5NzA0MA..*_ga_YB0PXWCT3D*MTcxMzM5NzAzOS4xLjEuMTcxMzM5NzA1NS4wLjAuMA.."))

{'cotton': 79, 'polyester': 11, 'viscose': 10}


In [20]:
# north face test: success ('climate conscious cotton' is not a material in the calculator)
print(scrape_materials("https://www.thenorthface.com/en-us/womens/womens-tops/womens-t-shirts-c213341/womens-short-sleeve-heavyweight-tee-pNF0A88E3?color=LK5"))

[]


In [27]:
# skims test: failure - needs to wait for content to be visible 
print(scrape_materials("https://skims.com/products/boyfriend-loose-boxer-lily"))

<!doctype html>
<meta data-flight='s2:"react.suspense"
j0:["@1",["$","$2",null,{"fallback":null,"children":"@3"}],null]
'/><!doctype html>
<html lang="en"><head>
<meta charset="utf-8"/>
<link href="https://cdn.shopify.com/oxygen-v2/26957/12054/24730/491077/favicon.ico" rel="icon" type="image/svg+xml"/>
<meta content="width=device-width, initial-scale=1.0, viewport-fit=cover" name="viewport"/>
<meta content="max-image-preview:large" name="robots"/>
<title>skims</title>
<link href="//cdn.dynamicyield.com" rel="preconnect"/>
<link href="//st.dynamicyield.com" rel="preconnect"/>
<link href="//rcom.dynamicyield.com" rel="preconnect"/>
<link href="//cdn.dynamicyield.com" rel="dns-prefetch"/>
<link href="//st.dynamicyield.com" rel="dns-prefetch"/>
<link href="//rcom.dynamicyield.com" rel="dns-prefetch"/>
<!-- onetrust cookies consent notice start for skims.com -->
<script type="text/javascript">
    // if (window.location.href.includes('skims.dev')) {
    //   let otautoblock = document.creat

In [90]:
# good american test: success
print(scrape_materials("https://www.goodamerican.com/products/good-legs-deep-v-clean-hem-blue609?Inseam=Regular"))

{'organic cotton': 98.5}


In [91]:
# hollister test: failure - how do i scrape bag and body separately?
print(scrape_materials("https://www.hollisterco.com/shop/us/p/low-rise-dark-wash-baggy-jeans-56195470?categoryId=12552&faceout=model&seq=0"))

{'cotton': 20, 'polyester': 80}


In [92]:
# theory test: success
print(scrape_materials("https://www.theory.com/treeca-pant-in-good-wool/H0101234_EA2.html"))

{'wool': 96}


In [38]:
# mango test: failure - needs to wait for content to be visible
print(scrape_materials("https://shop.mango.com/us/women/blazers-blazers/100-linen-suit-blazer_67077112.html?c=99"))

<!doctype html>

<html class="ficha" dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<title>100% linen suit blazer -  women | mango usa</title>
<meta content="100% linen straight design lapel-collar v-neck collar long sleeve with buttoned cuffs two side pockets with flaps button fastening on the front back-slit hem inner lining co-ord plus size available " name="description"/>
<meta content="100% linen suit blazer -  women | mango usa" property="og:title"/>
<meta content="article" property="og:type"/>
<meta content="https://st.mngbcn.com/rcs/pics/static/t6/fotos/s6/67077112_99.jpg?ts=1711460364927" property="og:image"/>
<meta content="https://shop.mango.com/us/women/blazers-blazers/100-linen-suit-blazer_67077112.html?c=99" property="og:url"/>
<meta content="mango" property="og:site_name"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=no, minimal-ui" name="viewport"/>
<meta content="mango mng" name="author"/>
<

In [93]:
# patagonia test: failure - breaks writing conventions, different percentages per part
print(scrape_materials('https://www.patagonia.com/product/mens-classic-retro-x-fleece-jacket/195699845640.html?s_kwcid=17928&utm_source=google&utm_medium=cpc&utm_campaign=BB_Ecomm_Shopping_ALL_WBSP_SaleKWs&gad_source=1&gclid=CjwKCAjw5v2wBhBrEiwAXDDoJd8XQGUW_sof8fEfoDRQUdlBPIlrxyB6fyngvB0--73eUmTbzjhBZBoCa84QAvD_BwE'))

{'polyester': 100, 'recycled polyester': 100, 'recycled nylon': 100}


In [7]:
# victoria's secret/pink test: success
print(scrape_materials('https://www.victoriassecret.com/us/pink/apparel-catalog/5000009721?brand=pink&collectionId=fe0131b4-96f7-46f6-88a2-3325ba6ed6f5&limit=180&orderBy=REC&priceType=regular&productId=77c4ee8f-ab11-48fa-8991-4e8acd67bb04&stackId=3db62f22-b572-48db-88b2-39a3259392ba&genericId=11243247&choice=3Z3G&product_position=1&stack_position=1&dataSource=manual-collection'))

{'cotton': 76, 'polyester': 24}


In [13]:
# primark test: success
print(scrape_materials('https://www.primark.com/en-us/p/paula-echevarria-tiered-prairie-midi-dress-black-991099551804'))

{'cotton': 100}


In [98]:
# yes friends test: success
print(scrape_materials('https://yesfriends.co/products/womens-fairtrade-organic-t-shirt?variant=44190218125526'))

{'organic cotton': 100}


In [99]:
# adidas test: failure - never finishes running
print(scrape_materials('https://www.adidas.com/us/manchester-united-23-24-home-jersey/IP1726.html'))

In [6]:
# aritzia test: success
print(scrape_materials('https://www.aritzia.com/us/en/product/raval-skirt/111879.html?dwvar_111879_color=31979'))

{'recycled polyester': 100}


In [7]:
# uniqlo test: failure - never finishes running
print(scrape_materials('https://www.uniqlo.com/ca/en/products/E464098-000?colorCode=COL70&sizeCode=KAG120'))

In [14]:
# area test: success
print(scrape_materials('https://area.nyc/collections/dresses/products/jumbo-crystal-denim-mini-dress?variant=45439476924648'))

{'cotton': 100}


In [None]:
# balenciaga test: skipped, no percentages

In [None]:
# bottega veneta test: skipped, no percentages

In [17]:
# brooks brothers test: failed - needs to wait for content to be visible
print(scrape_materials('https://www.brooksbrothers.com/paisley-floral-belted-shirt-dress-in-cotton/WX00839.html?dwvar_WX00839_Color=OWHT'))

<!doctype html>
<html lang="en-us"><head><title>just a moment...</title><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="ie=edge" http-equiv="x-ua-compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,blinkmacsystemfont,segoe ui,roboto,helvetica neue,arial,noto sans,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol,noto color emoji}@media (prefers-color-scheme:dark){body{background-color:#222;color:#d9d9d9}body a{color:#fff}body a:hover{color:#ee730a;text-decoration:underline}body .lds-ring div{border-color:#999 transparent transparent}body .font-red{color:#b20f03}body .big-button,body .pow-button{background-color:#4693ff;color:#1d1d1d}body #challenge-success-text{background-image:url(data:image/svg+xml;base

In [23]:
# burberry test: failed - how do i scrape bag and packaging separately?
print(scrape_materials('https://us.burberry.com/ekd-cotton-t-shirt-p80905481'))

<!doctype html>

<html data-application="browse" data-device="desktop" lang="en">
<head>
<title data-rh="true">ekd cotton t-shirt in salt - women | burberry® official</title>
<meta charset="utf-8"/>
<meta content="ie=edge,chrome=1" http-equiv="x-ua-compatible"/>
<meta content="width=device-width,initial-scale=1,user-scalable=no" name="viewport"/>
<link href="/nrws/browse/b67f8906bf88f31e6eb4456434ae5eb754565069/css/main.6d9af35e5a8951d5f745.css" rel="stylesheet" type="text/css"/>
<link href="/nrws/browse/b67f8906bf88f31e6eb4456434ae5eb754565069/css/chunks/vendor.954d7c80bdb62c1ed224.css" rel="stylesheet" type="text/css"/>
<link href="/nrws/browse/b67f8906bf88f31e6eb4456434ae5eb754565069/css/chunks/boss.d7fe9aa77948a4b55745.css" rel="stylesheet" type="text/css"/>
<link href="/nrws/browse/b67f8906bf88f31e6eb4456434ae5eb754565069/css/chunks/page-type-product-details.a4b606e50696140624b0.css" rel="stylesheet" type="text/css"/>
<link as="script" fetchpriority="low" href="/nrws/browse/b67f89

In [None]:
# coach: skipped, no percentages

In [24]:
# fendi: success
print(scrape_materials('https://www.fendi.com/us-en/woman/ready-to-wear/t-shirts-sweatshirts/sweater-light-blue-viscose-sweater-fzx999aq41f08s8'))


<!doctype html>

<html dir="" lang="en">
<head>
<link as="image" fetchpriority="high" href="https://static.fendi.com/dam/is/image/fendi/fzx999aq41f08s8_01?wid=1000&amp;hei=1000" rel="preload"/>
<link href="https://www.fendi.com/it-en/ck-static/favicon/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="https://www.fendi.com/it-en/ck-static/favicon/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="https://www.fendi.com/it-en/ck-static/favicon/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="https://www.fendi.com/it-en/ck-static/favicon/site.webmanifest" rel="manifest"/>
<link color="#000000" href="https://www.fendi.com/it-en/ck-static/favicon/safari-pinned-tab.svg" rel="mask-icon"/>
<link href="https://www.fendi.com/it-en/ck-static/favicon/favicon.ico" rel="shortcut icon"/>
<meta content="#000000" name="msapplication-tilecolor"/>
<meta content="/it-en/ck-static/favicon/browserconfig.xml" name="msapplication-config

In [25]:
# hermes test: failure, ai detection
print(scrape_materials('https://www.hermes.com/us/en/product/shirt-H4E0601DV9036/'))

<html>
<head><title>403 forbidden</title></head>
<body>
<center><h1>403 forbidden</h1></center>
</body>
</html>

{}


In [26]:
# louis vuitton test: never finishes running
print(scrape_materials('https://us.louisvuitton.com/eng-us/products/vintage-effect-louis-vuitton-t-shirt-nvprod5280041v/1AFMP5'))

In [6]:
# saint laurent test:
print(scrape_materials('https://www.ysl.com/en-us/dresses-and-skirts/backless-halter-dress-in-jersey-776601Y37NW1000.html'))

{'viscose': 80, 'wool': 14}
