## Write function for scraping all brand websites

## Problem "Buckets"
1. Materials breakdown has multiple parts with different percentages (bag vs. body)
2. Needs to wait for content to be visible 
3. Never finishes running

In [1]:
# imports
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

In [2]:
# create one shared session so every scrape goes through the same retry policy
session = requests.Session()
# retry up to five times on connection errors; backoff_factor adds a growing
# delay between successive attempts
retry = Retry(connect=5, backoff_factor=0.5)
# adapter that applies the retry policy to every request it handles
adapter = HTTPAdapter(max_retries=retry)
# mount on both schemes so plain-http URLs (and http hops in a redirect chain)
# are retried the same way as https ones
session.mount('https://', adapter)
session.mount('http://', adapter)

In [3]:
# materials list based off materials calculator
# NOTE: every entry must be lowercase, because the page text is lowercased
# before it is searched; a capitalised entry would never match.
materials = ['cotton', 'recycled cotton', 'organic cotton', 'polyester', 'recycled polyester', 'nylon', 'recycled nylon', 'acrylic',
             'spandex', 'flax', 'linen', 'hemp', 'cupro', 'lyocell', 'tencel', 'refibra', 'modal', 'tencel modal', 'viscose',
             'bamboo', 'lenzing viscose', 'ecovero', 'silk', 'alpaca', 'wool', 'recycled wool', 'cashmere', 'recycled cashmere']

In [5]:
# function to scrape only materials in the materials list
def scrape_materials(url, timeout=15):
    """Fetch a product page and return the material percentages found in it.

    The page HTML is lowercased and, for each entry of the module-level
    ``materials`` list, searched for a percentage followed by whitespace and
    the material name (e.g. ``'80% cotton'``).  Only the first match per
    material is kept, and results follow the order of ``materials``.

    Parameters
    ----------
    url : str
        Product page to fetch with the module-level ``session``.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``session.get``.
        Without a timeout requests waits indefinitely, which is presumably
        why some brand sites made the cell "never finish running".

    Returns
    -------
    list of str or Exception
        The matched ``'<pct>% <material>'`` strings (empty when nothing
        matches).  If anything raises, the exception object is *returned*
        instead of raised, so callers should check the type of the result.
    """
    try:
        # fetch the product page; give up after `timeout` seconds
        review = session.get(url, timeout=timeout)

        # parse and re-serialise so the search runs on the parser's
        # normalised markup, then lowercase to match case-insensitively
        content = BeautifulSoup(review.text, 'html.parser')
        norm_text = str(content).lower()

        made_of = []
        for material in materials:
            # percentage + whitespace + material name; the name is lowercased
            # (the text is lowercase) and escaped so it is matched literally.
            # A second "material followed by a non-space" search is not
            # needed: any such text already matches this shorter pattern.
            pattern = r'[0-9.]+%\s+' + re.escape(material.lower())
            result = re.search(pattern, norm_text)

            # if result exists, add to made_of list
            if result:
                made_of.append(result.group(0))

        return made_of

    except Exception as e:
        # best-effort: hand the error back to the caller instead of raising
        return e

In [44]:
# princess polly test: success
print(scrape_materials("https://us.princesspolly.com/products/caruso-denim-wrap-skort-light-wash"))

['100% cotton']


In [45]:
# nike test: success
print(scrape_materials("https://www.nike.com/t/jordan-paris-saint-germain-pullover-hoodie-big-kids-hoodie-tpqdWh/45C566-023?nikemt=true&cp=75009274456_search_&gad_source=1&gclid=CjwKCAjwoPOwBhAeEiwAJuXRh47Ul92JN0-vlgv6RcIG0mKBklxgk4gs4zOPV8cguxRM7hK9eK_DThoCLUoQAvD_BwE"))

['80% cotton', '20% polyester']


In [46]:
# brandy test: success
print(scrape_materials("https://us.brandymelville.com/products/zelly-basic-top"))

['100% cotton']


In [8]:
# abercrombie test: failure - how do i scrape bag and body separately?
print(scrape_materials("https://bit.ly/3UlUXwr"))


<!doctype html>

<html data-storeid="10051" lang="en">
<head>
<script>
			var productcatalog = {}, productprices = {};
		</script>
<script>try {productprices = productprices || {};productprices[56088331] = {"productid":"56088331","items":{"658277225":{"listpricefmt":"$70","priceflag":"1","itemid":"56089981","offerprice":70.0,"listpriceusd":70.0,"offerpricefmt":"$70","listprice":70.0},"658277313":{"listpricefmt":"$70","priceflag":"1","itemid":"56089990","offerprice":70.0,"listpriceusd":70.0,"offerpricefmt":"$70","listprice":70.0},"658277305":{"listpricefmt":"$70","priceflag":"1","itemid":"56089989","offerprice":70.0,"listpriceusd":70.0,"offerpricefmt":"$70","listprice":70.0},"658277348":{"listpricefmt":"$70","priceflag":"1","itemid":"56089993","offerprice":70.0,"listpriceusd":70.0,"offerpricefmt":"$70","listprice":70.0},"658277233":{"listpricefmt":"$70","priceflag":"1","itemid":"56089982","offerprice":70.0,"listpriceusd":70.0,"offerpricefmt":"$70","listprice":70.0},"658277321":{"listpr

In [65]:
# amazon test: failure - request is blocked: the response is a "503 - service unavailable" page that refers automated clients to amazon's api support
print(scrape_materials("https://www.amazon.com/Lyss%C3%A9-Womens-Schiffer-Button-Medium/dp/B01EWRKXTE/ref=lp_121173939011_1_2?pf_rd_p=53d84f87-8073-4df1-9740-1bf3fa798149&pf_rd_r=VN92AA0AZ4XSEQDKW4EJ"))

<!doctype html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<title>
      503 - service unavailable error</title>
<style>
      html,body{padding:0;margin:0}img{border:0}#a{background:#232f3e;padding:11px 11px 11px 192px}#b{position:absolute;left:22px;top:12px}#c{position:relative;max-width:800px;padding:0 40px 0 0}#e,#f{height:35px;border:0;font-size:1em}#e{width:100%;margin:0;padding:0 10px;border-radius:4px 0 0 4px}#f{cursor:pointer;background:#febd69;font-weight:bold;border-radius:0 4px 4px 0;-webkit-appearance:none;position:absolute;top:0;right:0;padding:0 12px}@media(max-width:500px){#a{padding:55px 10px 10px}#b{left:6px}}#g{text-align:center;margin:30px 0}#g img{max-width:90%}#d{display:none}#d[src]{display:inline}
    </style>
</head>
<body>
<!--
        to discuss automated access to amazon data please contact api-services-support@amazon.com

In [66]:
# asos test: failure - never finishes running
print(scrape_materials("https://www.asos.com/us/asos-design/asos-design-pleated-mini-skirt-in-gray/prd/205661997#colourWayId-205661998"))

In [6]:
# forever 21 test: failure - needs to wait for content to be visible (the response is a "just a moment..." interstitial page, not the product page)
print(scrape_materials('https://www.forever21.com/us/20012622040501.html'))

<!doctype html>
<html lang="en-us"><head><title>just a moment...</title><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="ie=edge" http-equiv="x-ua-compatible"/><meta content="noindex,nofollow" name="robots"/><meta content="width=device-width,initial-scale=1" name="viewport"/><style>*{box-sizing:border-box;margin:0;padding:0}html{line-height:1.15;-webkit-text-size-adjust:100%;color:#313131}button,html{font-family:system-ui,-apple-system,blinkmacsystemfont,segoe ui,roboto,helvetica neue,arial,noto sans,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol,noto color emoji}@media (prefers-color-scheme:dark){body{background-color:#222;color:#d9d9d9}body a{color:#fff}body a:hover{color:#ee730a;text-decoration:underline}body .lds-ring div{border-color:#999 transparent transparent}body .font-red{color:#b20f03}body .big-button,body .pow-button{background-color:#4693ff;color:#1d1d1d}body #challenge-success-text{background-image:url(data:image/svg+xml;base

In [59]:
# american eagle test: failure - server responds with an "access denied" page, so no materials are found
print(scrape_materials("https://www.ae.com/us/en/p/women/high-waisted-jeans/high-waisted-mom-jeans/ae-stretch-mom-jean/0436_4332_110?menu=cat4840004"))

<html><head>
<title>access denied</title>
</head><body>
<h1>access denied</h1>
 
you don't have permission to access "http://www.ae.com/us/en/p/women/high-waisted-jeans/high-waisted-mom-jeans/ae-stretch-mom-jean/0436_4332_110?" on this server.<p>
reference #18.964ddb17.1714618130.20fa953
<p>https://errors.edgesuite.net/18.964ddb17.1714618130.20fa953</p>
</p></body>
</html>

[]


In [62]:
# alo test: success
print(scrape_materials("https://www.aloyoga.com/products/w9538r-airbrush-stream-lined-bra-tank-ivory-black?variant=41346126512308&disableCurrencyEstimate&gad_source=1&gclid=CjwKCAjww_iwBhApEiwAuG6ccL77JcerRoX7_JAHcpKdcfgONVm-BMBBTVNxENk7-9cIIDnlReDxGxoCaegQAvD_BwE"))

['87% nylon']


In [63]:
# reformation test: success
print(scrape_materials("https://www.thereformation.com/products/vida-low-rise-pant/1314838BLK.html?dwvar_1314838BLK_color=BLK&_gl=1*13y89uh*_up*MQ..&gclid=CjwKCAjww_iwBhApEiwAuG6ccHuReQ50wHf1NGMlZG5XTJoqFuugi5BXX5mlzOcjB35u29kXoUvEwBoCbf4QAvD_BwE&atc_confirmation=true"))

['72% polyester', '3% spandex', '5% viscose', '5% wool']


In [10]:
# acne studios test: success
print(scrape_materials("https://www.acnestudios.com/us/en/wool-mohair-scarf---narrow-lavender-purple/CA0290-ADH.html?g=woman"))

['22% nylon', '33% alpaca', '25% wool']


In [11]:
# alice and olivia test: success
print(scrape_materials("https://www.aliceandolivia.com/rosalee-tie-strap-bustier-maxi-dress/CC404P22523G123.html?lang=default"))

['97% cotton']


In [12]:
# sandy liang test: success
print(scrape_materials("https://www.sandyliang.info/products/primo-dress"))

['35% cotton', '65% polyester']


In [13]:
# billabong test: success
print(scrape_materials("https://www.billabong.com/collections/womens-clothing-tops/products/faye-knit-1"))

['49% cotton', '46% polyester']


In [14]:
# boohoo test: success
print(scrape_materials("https://us.boohoo.com/lapel-crop-blazer/DZZ52779.html?color=123&_gl=1*m20h85*_up*MQ..*_ga*MjAyMzc4NTAyNi4xNzEzMzk2ODY5*_ga_CKX55DLD7G*MTcxMzM5Njg2OC4xLjEuMTcxMzM5Njk1My4wLjAuMA.."))

['95% polyester']


In [15]:
# nasty gal test: success
print(scrape_materials("https://www.nastygal.com/diamante-denim-wide-leg-jeans/BGG21619.html?_gl=1*dqvdhd*_up*MQ..*_ga*MTU0NzA2MjcuMTcxMzM5NzA0MA..*_ga_YB0PXWCT3D*MTcxMzM5NzAzOS4xLjEuMTcxMzM5NzA1NS4wLjAuMA.."))

['79% cotton', '11% polyester', '10% viscose']


In [20]:
# north face test: success ('climate conscious cotton' is not a material in the calculator)
print(scrape_materials("https://www.thenorthface.com/en-us/womens/womens-tops/womens-t-shirts-c213341/womens-short-sleeve-heavyweight-tee-pNF0A88E3?color=LK5"))

[]


In [27]:
# skims test: failure - needs to wait for content to be visible 
print(scrape_materials("https://skims.com/products/boyfriend-loose-boxer-lily"))

<!doctype html>
<meta data-flight='s2:"react.suspense"
j0:["@1",["$","$2",null,{"fallback":null,"children":"@3"}],null]
'/><!doctype html>
<html lang="en"><head>
<meta charset="utf-8"/>
<link href="https://cdn.shopify.com/oxygen-v2/26957/12054/24730/491077/favicon.ico" rel="icon" type="image/svg+xml"/>
<meta content="width=device-width, initial-scale=1.0, viewport-fit=cover" name="viewport"/>
<meta content="max-image-preview:large" name="robots"/>
<title>skims</title>
<link href="//cdn.dynamicyield.com" rel="preconnect"/>
<link href="//st.dynamicyield.com" rel="preconnect"/>
<link href="//rcom.dynamicyield.com" rel="preconnect"/>
<link href="//cdn.dynamicyield.com" rel="dns-prefetch"/>
<link href="//st.dynamicyield.com" rel="dns-prefetch"/>
<link href="//rcom.dynamicyield.com" rel="dns-prefetch"/>
<!-- onetrust cookies consent notice start for skims.com -->
<script type="text/javascript">
    // if (window.location.href.includes('skims.dev')) {
    //   let otautoblock = document.creat

In [31]:
# good american test: success
print(scrape_materials("https://www.goodamerican.com/products/good-legs-deep-v-clean-hem-blue609?Inseam=Regular"))

['98.5% organic cotton']


In [32]:
# hollister test: failure - how do i scrape bag and body separately?
print(scrape_materials("https://www.hollisterco.com/shop/us/p/low-rise-dark-wash-baggy-jeans-56195470?categoryId=12552&faceout=model&seq=0"))

['100% cotton', '80% polyester']


In [33]:
# theory test: success
print(scrape_materials("https://www.theory.com/treeca-pant-in-good-wool/H0101234_EA2.html"))

['96% wool']


In [38]:
# mango test: failure - needs to wait for content to be visible
print(scrape_materials("https://shop.mango.com/us/women/blazers-blazers/100-linen-suit-blazer_67077112.html?c=99"))

<!doctype html>

<html class="ficha" dir="ltr" lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<title>100% linen suit blazer -  women | mango usa</title>
<meta content="100% linen straight design lapel-collar v-neck collar long sleeve with buttoned cuffs two side pockets with flaps button fastening on the front back-slit hem inner lining co-ord plus size available " name="description"/>
<meta content="100% linen suit blazer -  women | mango usa" property="og:title"/>
<meta content="article" property="og:type"/>
<meta content="https://st.mngbcn.com/rcs/pics/static/t6/fotos/s6/67077112_99.jpg?ts=1711460364927" property="og:image"/>
<meta content="https://shop.mango.com/us/women/blazers-blazers/100-linen-suit-blazer_67077112.html?c=99" property="og:url"/>
<meta content="mango" property="og:site_name"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0, user-scalable=no, minimal-ui" name="viewport"/>
<meta content="mango mng" name="author"/>
<

In [41]:
# patagonia test: failure - breaks writing conventions, different percentages per part
print(scrape_materials('https://www.patagonia.com/product/mens-classic-retro-x-fleece-jacket/195699845640.html?s_kwcid=17928&utm_source=google&utm_medium=cpc&utm_campaign=BB_Ecomm_Shopping_ALL_WBSP_SaleKWs&gad_source=1&gclid=CjwKCAjw5v2wBhBrEiwAXDDoJd8XQGUW_sof8fEfoDRQUdlBPIlrxyB6fyngvB0--73eUmTbzjhBZBoCa84QAvD_BwE'))

['100% polyester', '100% recycled polyester', '100% recycled nylon']


In [47]:
# victoria's secret/pink test: success
print(scrape_materials('https://www.victoriassecret.com/us/pink/apparel-catalog/5000009721?brand=pink&collectionId=fe0131b4-96f7-46f6-88a2-3325ba6ed6f5&limit=180&orderBy=REC&priceType=regular&productId=77c4ee8f-ab11-48fa-8991-4e8acd67bb04&stackId=3db62f22-b572-48db-88b2-39a3259392ba&genericId=11243247&choice=3Z3G&product_position=1&stack_position=1&dataSource=manual-collection'))

['76% cotton', '24% polyester']


In [48]:
# primark test: success
print(scrape_materials('https://www.primark.com/en-us/p/sleeveless-floral-playsuit-pink-991096807306'))

['99% polyester']


In [49]:
# yes friends test: success
print(scrape_materials('https://yesfriends.co/products/womens-fairtrade-organic-t-shirt?variant=44190218125526'))

['100% organic cotton']


In [50]:
# adidas test: failure - never finishes running
print(scrape_materials('https://www.adidas.com/us/manchester-united-23-24-home-jersey/IP1726.html'))

In [6]:
# aritzia test: success
print(scrape_materials('https://www.aritzia.com/us/en/product/raval-skirt/111879.html?dwvar_111879_color=31979'))

['100% recycled polyester']


In [7]:
# uniqlo test: failure - never finishes running
print(scrape_materials('https://www.uniqlo.com/ca/en/products/E464098-000?colorCode=COL70&sizeCode=KAG120'))