In [11]:
# Standard-library imports
import os
import re
import time

# Selenium imports
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Other third-party imports
import wget

# 0. Preprocessing

## 0.1 Definitions

In [92]:
class Product:
    """Container for the data scraped for a single product.

    Every constructor argument is stored unchanged in a private
    (name-mangled) attribute and exposed through a matching ``get_*`` method.
    """

    def __init__(self, name, brand,
                 price=0, rating=None, loves=None,
                 skin_type=None, concerns=None,
                 all_ingredients=None, highlighted_ingredients=None,
                 eco_info=None,
                 usage=None):
        """Store the product fields.

        Only ``name`` and ``brand`` are required; ``price`` defaults to 0 and
        every other field defaults to None.
        """
        self.__name = name
        self.__brand = brand
        self.__price = price
        self.__rating = rating
        self.__loves = loves
        self.__skin_type = skin_type
        self.__concerns = concerns
        self.__all_ingredients = all_ingredients
        self.__highlighted_ingredients = highlighted_ingredients
        self.__eco_info = eco_info
        self.__usage = usage

    def __str__(self):
        """Return a short description containing the name and the brand."""
        return "Product Description: \n  name = % s,\n  brand = % s" % (self.__name, self.__brand)

    def get_name(self):
        """Return the product name."""
        return self.__name

    def get_brand(self):
        """Return the brand."""
        return self.__brand

    def get_price(self):
        """Return the price as it was passed to the constructor."""
        return self.__price

    def get_rating(self):
        """Return the rating, or None if it was not provided."""
        return self.__rating

    def get_loves(self):
        """Return the "loves" value, or None if it was not provided."""
        return self.__loves

    def get_skin_type(self):
        """Return the skin type value, or None if it was not provided."""
        return self.__skin_type

    def get_concerns(self):
        """Return the concerns value, or None if it was not provided."""
        return self.__concerns

    def get_ingredients(self, all_of_them=True):
        """Return the full ingredient list, or the highlighted one.

        Args:
            all_of_them: when truthy (default) return ``all_ingredients``,
                otherwise return ``highlighted_ingredients``.
        """
        return self.__all_ingredients if all_of_them else self.__highlighted_ingredients

    def get_eco_info(self):
        """Return the eco information, or None if it was not provided."""
        return self.__eco_info

    def get_usage(self):
        """Return the usage information, or None if it was not provided."""
        return self.__usage





    


## 0.2 Setting up the browser's driver

In [70]:
# Connect the webdriver to our notebook.
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to one machine; the
# original hard-coded location is kept as the fallback.
# NOTE(review): `executable_path` is the Selenium 3 style of passing the
# driver location; newer Selenium releases may require a Service object —
# confirm against the installed version.
chrome_path = os.environ.get("CHROMEDRIVER_PATH", "/Users/balapan/Downloads/chromedriver")
driver = webdriver.Chrome(executable_path = chrome_path)

# get and open the needed webpage
driver.get("https://www.sephora.com/")

## 0.3 Navigation to the targeted page - All Skincare 

In [69]:
# Sephora's navigation bar layout changes with the window size, so two
# strategies are tried in turn.
try:
    # Full-screen layout: wait (up to 10 s) for the skincare trigger to become
    # clickable and click it. `.click()` returns None, so nothing is stored.
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "a[id = 'top_nav_drop_3_trigger']"))
    ).click()
    # NOTE(review): the same element id is located and clicked a second time;
    # presumably the first click opens the menu and the second navigates — confirm.
    all_skincare = driver.find_element_by_xpath('//*[@id="top_nav_drop_3_trigger"]')
    all_skincare.click()
except Exception:
    # Small-window layout. `Exception` (rather than a bare `except`) keeps the
    # best-effort fallback while letting KeyboardInterrupt/SystemExit through.
    time.sleep(5)
    xpath = "/html/body/div[1]/div[1]/header/div[3]/div/a[4]"
    all_skincare = driver.find_element_by_xpath(xpath)
    all_skincare.click()

In [14]:
# Dismiss the sign-in pop-up: wait up to 10 s for its "Continue shopping"
# button to become clickable, then click it.
continue_shopping_locator = (By.CSS_SELECTOR, "button[aria-label = 'Continue shopping']")
continue_shopping_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(continue_shopping_locator)
)
# `.click()` returns None, so `cross` ends up as None (kept for compatibility).
cross = continue_shopping_button.click()


## 0.4 Getting necessary subpages 

In [68]:
# Skincare categories we want to scrape; every other category is ignored.
tickers = [
    'Moisturizers',
    'Cleansers',
    'Treatments',
    'Eye Care',
    'Masks',
    'Sun Care',
    'Lip Treatments',
]

# To test on a single category, uncomment the next line:
# tickers = tickers[:1]
tickers

['Moisturizers',
 'Cleansers',
 'Treatments',
 'Eye Care',
 'Masks',
 'Sun Care',
 'Lip Treatments']

In [17]:
# Every category link carries the same CSS classes, so one class-name lookup
# collects all of them.
category_class = "css-10wlsyd.e65zztl0"
class_objects = driver.find_elements_by_class_name(category_class)

# Sanity check: list the visible text of each element that was found.
class_text = list(map(lambda element: element.text, class_objects))
class_text

['Just Arrived',
 'Value & Gift Sets',
 'Mini Size',
 'Clean Skincare',
 'Shop by Concern',
 'Moisturizers',
 'Cleansers',
 'Treatments',
 'Wellness',
 'Eye Care',
 'Masks',
 'High Tech Tools',
 'Sun Care',
 'Self Tanners',
 'Lip Treatments',
 'Vegan Skincare']

In [18]:
# Keep only the category elements whose text is one of the wanted tickers.
categories = list(filter(lambda element: element.text in tickers, class_objects))
links = []

# Print the kept names so they can be compared with the tickers list.
for m in categories:
    print(m.text)

Moisturizers
Cleansers
Treatments
Eye Care
Masks
Sun Care
Lip Treatments


# 1. Website Scraping

 ## category by category, product by product

In [21]:
# Chrome's notification pop-ups interfere with the scraper

In [26]:
# Open the first selected category (the first entry of `categories`).
categories[0].click()

In [28]:
# CSS classes used to locate the product tiles on the category page.
image_class = "css-1rovmyu.e65zztl0"

# Collect the product elements currently present on the page and show them.
products = driver.find_elements_by_class_name(image_class)
products

[<selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="8c87a0ac-3bee-4a47-a891-bfa5733900ed")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="0276ebeb-7a92-419b-80cf-cd91788582b2")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="fed4eb0b-aeeb-4225-b458-472c8adb9941")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="f0345286-9b9e-4ae4-9f7a-c5d929ff25b6")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="e35a49da-fa76-4949-8f8b-b781ed8236ef")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="b7e24530-9aae-457e-9a94-343659ecbbd9")>,
 <selenium.webdriver.remote.webelement.WebElement (session="f5c3bcddebd6723424748b46b885f329", element="04da2641-b754-48c4-affd-26

In [29]:
# Number of product elements that were collected.
len(products)

12

In [30]:
# Open the first collected product.
products[0].click()

In [35]:
# Read the name, brand, price and number of loves from the product page.
# Selectors used previously, kept for reference:
#   css_name  = 'span.css-57kn72'
#   css_price = 'span[data-at="price"]'
#   css_loves = 'span[data-at="product_love_count"]'
css_name  = 'span.css-1pgnl76.css-1pgnl76'
css_brand = 'a.css-nc375s.e65zztl0'
css_price = 'span.css-1lzahen'
css_loves = 'span.css-jk94q9'


def _text_of(css_selector):
    """Return the text of the first element matching ``css_selector``."""
    return driver.find_element_by_css_selector(css_selector).text


name  = _text_of(css_name)
brand = _text_of(css_brand)
price = _text_of(css_price)
loves = _text_of(css_loves)

print(name, brand, price, loves)

Protini™ Polypeptide Moisturizer Drunk Elephant $68.00 307K


In [90]:
# Wrap the scraped fields in a Product and keep it in a one-element list.
first_product = Product(name, brand, price, loves=loves)
item = [first_product]
print(item[0].get_name())

Protini™ Polypeptide Moisturizer


In [37]:
# # get rating

# keys = list(map(str, list(range(5,0,-1))))
# values = list(range(1,6))
# rating_dict = dict(zip(keys, values))


# def get_votes(v):
#     time.sleep(5)
# #     css_rating = '//*[@id="ratings-reviews"]/div[2]/div[2]/div[2]/table/tbody/tr['+ str(v)+']/td[3]'
#     css_rating = '//*[@id="ratings-reviews"]/div[2]/div[2]/div[2]/table/tbody/tr[1]/td[3]'
#     return int(driver.find_element_by_xpath(css_rating).text)
    

# rating = {k: get_votes(v) for k, v in rating_dict.items()}
# rating

In [38]:
def extract_number(string):
    """Return the first number found in ``string`` as a float.

    Thousands separators are accepted ("5,245" -> 5245.0), a decimal point is
    honoured ("4.5" -> 4.5 instead of the 45.0 produced by concatenating
    digits), and a trailing K/M abbreviation is expanded ("307K" -> 307000.0).

    Args:
        string: text that contains a number, e.g. "5245 reviews".

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: if ``string`` contains no digits.
    """
    # group 1: digits with optional thousands commas and a decimal part
    # group 2: optional K/M abbreviation, only when it ends a word
    match = re.search(r'(\d[\d,]*(?:\.\d+)?)(?:\s*([kKmM])\b)?', string)
    if match is None:
        raise ValueError("no number found in %r" % (string,))

    value = float(match.group(1).replace(',', ''))
    suffix = match.group(2)
    if suffix:
        value *= {'k': 1000, 'm': 1000000}[suffix.lower()]
    return value


# Overall rating/score: located by XPath and converted to float.
css_rating = '//*[@id="ratings-reviews-container"]/div[2]/div[2]/div[1]/div/div[2]/div/span'
rating_element = driver.find_element_by_xpath(css_rating)
total_score = float(rating_element.text)

# Total number of reviews: the element text is reduced to a number.
css_review_count = '//*[@id="ratings-reviews-container"]/div[2]/div[2]/div[1]/div/div[2]/span'
review_count_element = driver.find_element_by_xpath(css_review_count)
total_reviews_count = extract_number(review_count_element.text)

print(total_score, total_reviews_count)

4.0 5245.0


In [52]:
import re
def extract_highlights(highlights):
    """Return the entries of all "Good for:" lines found in ``highlights``.

    The list is returned (the previous version only printed it, so callers
    always received None). A "Good for:" line at the very end of the text,
    without a trailing newline, is included, and surrounding whitespace is
    stripped from each entry.

    Args:
        highlights: multi-line text of the highlights section.

    Returns:
        A list of strings, one per non-empty "Good for:" line; empty if there
        is none.
    """
    # `.` does not match a newline, so each capture stops at its end of line.
    entries = (entry.strip() for entry in re.findall(r'Good for:(.*)', highlights))
    return [entry for entry in entries if entry]

# Read the highlights section and keep the info from its "Good for" lines.
good_for = []
css_highlights = '/html/body/div[1]/div[2]/div/main/div/div[3]'
highlights_element = driver.find_element_by_xpath(css_highlights)
highlights = highlights_element.text

# Show the raw section text first ...
print('Extracted full info\n', highlights, sep='\n')

# ... then what the extraction helper produces from it.
print('\nHighlights only\n')
good_for = extract_highlights(highlights)
good_for



Extracted full info

Good for: Loss of firmness
Good for: Dullness/Uneven Texture
Good for: Anti-Aging
Good for: Dryness
Clean at Sephora
Community Favorite

Highlights only

[' Loss of firmness', ' Dullness/Uneven Texture', ' Anti-Aging', ' Dryness']


In [53]:
# Text of the "about the product" section, located by absolute XPath.
css_about_product = '/html/body/div[1]/div[2]/div/main/div/div[5]/div[2]/div'
about_element = driver.find_element_by_xpath(css_about_product)
about_product = about_element.text


def extract_skin_type(about):
    """Return the skin types listed on the "Skin Type:" line of ``about``.

    The line is split on commas and on the word "and"; empty entries are
    dropped and whitespace is stripped.

    Args:
        about: multi-line text of the product description.

    Returns:
        A list of skin-type strings, e.g. ['Normal', 'Dry']. An empty list is
        returned when the text has no "Skin Type:" line (previously this
        raised IndexError, as did a "Skin Type:" line at the very end of the
        text without a trailing newline).
    """
    # `.` stops at the end of the line, so no trailing newline is required.
    match = re.search(r'Skin Type:[ \t]*(.*)', about)
    if match is None:
        return []

    pieces = match.group(1).replace(' and ', ',').split(',')
    # Strip before filtering so whitespace-only pieces are dropped too.
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]
      
# Extract the skin types from the "about" text read above and display them.
skin_types = extract_skin_type(about_product)
skin_types

['Normal', 'Dry', 'Combination', 'Oily']

In [55]:
def extract_eco_info(about):
    """Derive the eco flags from the "Ingredient Callouts:" line of ``about``.

    The module-level flags ``vegan``, ``cruelty_free``,
    ``recyclable_packaging`` and ``no_parabens`` are (re)assigned on every
    call, so a value left over from a previously processed product can no
    longer survive. When the text has no "Ingredient Callouts:" line every
    flag is False (previously this raised IndexError).

    Args:
        about: multi-line text of the product description.

    Returns:
        A dict with the keys 'vegan', 'cruelty_free', 'recyclable_packaging'
        and 'no_parabens' holding the same booleans as the globals.
    """
    global recyclable_packaging, cruelty_free, vegan, no_parabens

    # `.` stops at the end of the line, so no trailing newline is required.
    match = re.search(r'Ingredient Callouts:[ \t]*(.*)', about)
    callouts = match.group(1).lower() if match else ''

    recyclable_packaging = 'recyclable' in callouts
    vegan = 'vegan' in callouts
    cruelty_free = 'cruelty-free' in callouts
    # NOTE(review): assumes the callouts mention parabens only to say the
    # product is free of them — confirm against real page text.
    no_parabens = 'parabens' in callouts

    print(vegan,cruelty_free,recyclable_packaging, no_parabens)

    return {
        'vegan': vegan,
        'cruelty_free': cruelty_free,
        'recyclable_packaging': recyclable_packaging,
        'no_parabens': no_parabens,
    }
    
# Start with every eco flag switched off; extract_eco_info updates them.
vegan = cruelty_free = recyclable_packaging = no_parabens = False

# The section text is read again here: the earlier value was observed to
# change for an unknown reason, so it is refreshed before parsing.
css_about_product = '/html/body/div[1]/div[2]/div/main/div/div[5]/div[2]/div'
about_element = driver.find_element_by_xpath(css_about_product)
about_product = about_element.text
extract_eco_info(about_product)


True True False False


In [57]:
# Module-level placeholders; both start out as separate empty lists.
highlighted_ingredients, all_ingredients = [], []

# get all ingredients and high ingredients
def _parse_ingredients(description):
    """Split the ingredients-panel text into (all_ingredients, highlighted_ingredients)."""
    conversational_words = [
        ' ' + word + ' '
        for word in ('are', 'is', 'in', 'with', 'to', 'please', 'must', 'be')
    ]
    all_ingredients = []
    highlighted_ingredients = []

    for paragraph in description.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            # Leading/trailing blank lines yield empty paragraphs; indexing
            # them used to raise IndexError.
            continue

        if paragraph.startswith('-'):
            # One highlighted ingredient per "-Name: description" line. The
            # name stops at the first colon, so a colon inside the description
            # is no longer swallowed into the name.
            names = re.findall(r'^-[ \t]*([^:\n]+):', paragraph, flags=re.MULTILINE)
            highlighted_ingredients = [name.strip() for name in names]
        elif not any(word in paragraph for word in conversational_words):
            # A paragraph without conversational words is taken to be the
            # comma-separated full ingredient list.
            print(paragraph)
            print('')
            all_ingredients = paragraph.split(', ')
            # remove the dot that ends the ingredient paragraph
            if all_ingredients[-1].endswith('.'):
                all_ingredients[-1] = all_ingredients[-1][:-1]

    return (all_ingredients, highlighted_ingredients)


def get_ingredients():
    """Open the ingredients panel and return its parsed content.

    Uses the module-level ``driver``: clicks the ingredients button, reads the
    panel text and parses it paragraph by paragraph.

    Returns:
        A tuple ``(all_ingredients, highlighted_ingredients)`` of two lists of
        strings; either list is empty when the matching paragraph is absent.
    """
    xpath_button = '/html/body/div[1]/div[2]/div/main/div/button[1]'
    xpath_descr = '//*[@id="ingredients"]/div/div'

    ingredients_button = driver.find_element_by_xpath(xpath_button)
    ingredients_button.click()

    description = driver.find_element_by_xpath(xpath_descr).text
    return _parse_ingredients(description)

# Scrape the ingredients panel; note the order of the returned pair: (all, highlighted).
all_ingredients, highlighted_ingredients  = get_ingredients()

Water/Aqua/Eau, Dicaprylyl Carbonate, Glycerin, Cetearyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Sclerocarya Birrea Seed Oil, Bacillus/Soybean/ Folic Acid Ferment Extract, Nymphaea Alba Root Extract, sh-Oligopeptide-1, sh-Oligopeptide-2, sh-Polypeptide-1, sh-Polypeptide-9, sh-Polypeptide-11, Copper Palmitoyl Heptapeptide-14, Heptapeptide-15 Palmitate, Palmitoyl Tetrapeptide-7, Palmitoyl Tripeptide-1, Alanine, Arginine, Glycine, Histidine, Isoleucine, Phenylalanine, Proline, Serine, Threonine, Valine, Acetyl Glutamine, Coconut Alkanes, Coco-Caprylate/Caprate, Sodium Hyaluronate, Aspartic Acid, Linoleic Acid, Linolenic Acid, Lecithin, Butylene Glycol, Polyvinyl Alcohol, Sodium Lactate, Sodium PCA, PCA, Sorbitan Isostearate, Carbomer, Polysorbate 20, Polysorbate 60, Lactic Acid/Glycolic Acid Copolymer, Hydroxyethyl Acrylate/Sodium Acryloyldimethyl Taurate Copolymer, Xanthan Gum, Isomalt, 1,2-Hexanediol, Caprylyl Glycol, Chlorphenesin, Phenoxyethanol, Tocopherol, Sodium Benzoate, Phen

In [58]:
# Display the highlighted ingredients that were extracted.
highlighted_ingredients

['Signal Peptide Complex (Growth Factors)',
 'Pygmy Waterlily Stem Cell Extract',
 'Soybean Folic Acid Ferment Extract']

In [59]:
# Display the full ingredient list that was extracted.
all_ingredients

['Water/Aqua/Eau',
 'Dicaprylyl Carbonate',
 'Glycerin',
 'Cetearyl Alcohol',
 'Cetearyl Olivate',
 'Sorbitan Olivate',
 'Sclerocarya Birrea Seed Oil',
 'Bacillus/Soybean/ Folic Acid Ferment Extract',
 'Nymphaea Alba Root Extract',
 'sh-Oligopeptide-1',
 'sh-Oligopeptide-2',
 'sh-Polypeptide-1',
 'sh-Polypeptide-9',
 'sh-Polypeptide-11',
 'Copper Palmitoyl Heptapeptide-14',
 'Heptapeptide-15 Palmitate',
 'Palmitoyl Tetrapeptide-7',
 'Palmitoyl Tripeptide-1',
 'Alanine',
 'Arginine',
 'Glycine',
 'Histidine',
 'Isoleucine',
 'Phenylalanine',
 'Proline',
 'Serine',
 'Threonine',
 'Valine',
 'Acetyl Glutamine',
 'Coconut Alkanes',
 'Coco-Caprylate/Caprate',
 'Sodium Hyaluronate',
 'Aspartic Acid',
 'Linoleic Acid',
 'Linolenic Acid',
 'Lecithin',
 'Butylene Glycol',
 'Polyvinyl Alcohol',
 'Sodium Lactate',
 'Sodium PCA',
 'PCA',
 'Sorbitan Isostearate',
 'Carbomer',
 'Polysorbate 20',
 'Polysorbate 60',
 'Lactic Acid/Glycolic Acid Copolymer',
 'Hydroxyethyl Acrylate/Sodium Acryloyldimethy

The usage schedule is not a reliable factor.
`use_daily` is difficult to assign because the word "daily" is often omitted.

In [60]:
def get_usage_schedule():
    """Open the "how to use" panel and derive the usage schedule from its text.

    Uses the module-level ``driver``. Only paragraphs (separated by a blank
    line) of the lower-cased panel text that contain 'suggested usage' are
    inspected.

    Returns:
        A tuple ``(use_daily, usage)`` where ``use_daily`` is True when such a
        paragraph contains 'daily', and ``usage`` is a dict with the boolean
        keys 'morning' and 'night'; 'twice a day' switches both on.
    """
    button_xpath = '/html/body/div[1]/div[2]/div/main/div/button[2]'
    panel_xpath = '//*[@id="howtouse"]/div/div'

    driver.find_element_by_xpath(button_xpath).click()

    panel_text = driver.find_element_by_xpath(panel_xpath).text.lower()
    relevant = [block for block in panel_text.split('\n\n') if 'suggested usage' in block]

    def mentions(phrase):
        # True when any inspected paragraph contains the phrase
        return any(phrase in block for block in relevant)

    twice = mentions('twice a day')
    schedule = {
        'morning': twice or mentions('morning'),
        'night': twice or mentions('night'),
    }
    return (mentions('daily'), schedule)
    
# Defaults: not daily, neither morning nor night.
use_daily, usage = False, {'morning': False, 'night': False}

# Replace the defaults with what the "how to use" panel says.
use_daily, usage = get_usage_schedule()

In [61]:
# Display whether the product was detected as a daily-use product.
use_daily

False

In [62]:
# Display the detected morning/night usage flags.
usage

{'morning': True, 'night': True}