In [279]:
#Selenium imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains


#Other imports
import os
import wget
import time
import re
from pprint import pprint

# 0. Preprocessing

## 0.1 Definitons

In [246]:
class Product:
    def __init__(this, name, brand, price=0 , 
               rating=None,reviews_num=None, loves=None, 
               skin_type=None, concerns=None, 
               all_ingredients=None, highlighted_ingredients=None,
               eco_info=None,
               usage=None):
        this.__name = name
        this.__brand = brand
        this.__price = price
        this.__rating = rating
        this.__reviews_num = reviews_num
        this.__loves = loves
        this.__skin_type = skin_type
        this.__concerns = concerns
        this.__all_ingredients = all_ingredients
        this.__highlighted_ingredients = highlighted_ingredients
        this.__eco_info = eco_info
        this.__usage = usage

    def __str__(self):  
        return "Product Description: \n  name = % s,\n  brand = % s" % (self.__name, self.__brand)

    
    # getters
    def get_name(self):
        return self.__name

    def get_brand(self):
        return self.__brand

    def get_price(self):
        return self.__price

    def get_rating(self):
        return self.__rating
    
    def get_reviews_num(self):
        return self.__reviews_num

    def get_loves(self):
        return self.__loves

    def get_skin_type(self):
        return self.__skin_type

    def get_concerns(self):
        return self.__concerns

    def get_ingredients(self, all_of_them = True):
        return self.__all_ingredients if all_of_them else self.__highlighted_ingredients

    def get_eco_info(self):
        return self.__eco_info

    def get_usage(self):
        return self.__usage
    
    
    # setters
    def set_name(self, name):
        self.__name = name

    def set_brand(self, brand):
        self.__brand = brand

    def set_price(self, price):
        self.__price = price

    def set_rating(self, rating):
        self.__rating = rating
    
    def set_reviews_num(self, reviews_num):
        self.__reviews_num = reviews_num

    def set_loves(self, loves):
        self.__loves = loves

    def set_skin_type(self, skin_type):
        self.__skin_type = skin_type

    def set_concerns(self, concerns):
        self.__concerns = concerns

    def set_ingredients(self, ingredients, all_of_them = True):
        if all_of_them: 
            self.__all_ingredients = ingredients
        else:
            self.__highlighted_ingredients = ingredients

    def set_eco_info(self, eco_info):
        self.__eco_info = eco_info

    def set_usage(self, usage):
        self.__usage = usage

In [247]:
# # doesn't work for float numbers in a string 
# def extract_number(string):
#     number = ''
#     for char in string:
#         if char.isdigit():
#             number+=(char)
#     return float(number)

def parse_number(string):
    m = re.search(r"(\d+\.?\d*)", string)
    return m.group() if m else None

In [248]:
def extract_highlights(highlights):
    start = 'Good for:'
    end = '\n'

    result = re.findall('%s(.*)%s' % (start, end), highlights)
    return result

In [249]:
def extract_skin_type(about):
    start = 'Skin Type: '
    end = '\n'
    
    result = re.findall('%s(.*)%s' % (start, end), about)[0]
    result = (result.replace(' and ', ',')).split(',')
    skin_types = list(filter(None, result))
    skin_types = [st.strip() for st in skin_types]
    return skin_types

In [250]:
# this doesn't work properly, works from the second attempt... is the problem with
# clickling/not clicking on the show more button
def extract_eco_info(about):
    global recyclable_packaging, cruelty_free, vegan, no_parabens
    start = 'Ingredient Callouts: '
    end = '\n'
    
    try:
        xpath_show_more = '/html/body/div[1]/div[2]/div/main/div/div[5]/div[2]/button'
        show_more_btn = driver.find_element_by_xpath(xpath_show_more)
        show_more_btn.click()
        result = (re.findall('%s(.*)%s' % (start, end), about)[0]).lower()
    except:
        result = (re.findall('%s(.*)%s' % (start, end), about)[0]).lower()
    
    if 'recyclable' in result:
        recyclable_packaging = True
    
    if 'vegan' in result:
        vegan = True 
        
    if 'cruelty-free' in result:
        cruelty_free = True
    
    if 'parabens' in result:
        no_parabens = True
        
    return {'vegan': vegan,
          'cruelty_free': cruelty_free,
          'recyclable_packaging': recyclable_packaging, 
          'no_parabens': no_parabens}

In [251]:
# get all ingredients and high ingredients
def get_ingredients():
    highlighted_ingredients = []
    all_ingredients = []
    conversational_words = ['are','is', 'in', 'with', 'to', 'please', 'must',  'be']
    conversational_words = list(map(lambda x: ' ' + x + ' ', conversational_words))
    
    xpath_button = '/html/body/div[1]/div[2]/div/main/div/button[1]'
    xpath_descr = '//*[@id="ingredients"]/div/div'
    
    ingredients_button = driver.find_element_by_xpath(xpath_button)
    ingredients_button.click()
    
    description = driver.find_element_by_xpath(xpath_descr).text
    description = description.split('\n\n')
    
    for paragraph in description: 
        if paragraph[0] == '-':
            start = '-'
            end = ':'
            highlighted_ingredients = re.findall('%s(.*)%s' % (start,end), paragraph)
        elif not(any(word in paragraph for word in conversational_words)):
            print(paragraph)
            print('')
            all_ingredients = (paragraph.split(', ')).copy()
            #remove dot at the end of the converted to list of ingredients paragraph
            if all_ingredients[-1][-1] == '.':
                all_ingredients[-1] = all_ingredients[-1][:-1]
                 
    return (all_ingredients, highlighted_ingredients)

In [252]:
def get_usage_schedule():
    use_daily = False
    usage = {
        'morning': False,
        'night': False
        }
    
    xpath_button = '/html/body/div[1]/div[2]/div/main/div/button[2]'
    xpath_descr = '//*[@id="howtouse"]/div/div'
    
    how_to_use = driver.find_element_by_xpath(xpath_button)
    how_to_use.click()
    
    usage_schedule = driver.find_element_by_xpath(xpath_descr).text.lower()
    usage_schedule = usage_schedule.split('\n\n')
    
    for paragraph in usage_schedule: 
        if 'suggested usage' in paragraph:
            if 'daily' in paragraph:
                use_daily = True
            if 'morning' in paragraph:
                usage['morning'] = True 
            if 'night' in paragraph:
                usage['night'] = True
            if 'twice a day' in paragraph: 
                usage['morning'] = True 
                usage['night'] = True
    return (use_daily, usage)

## 0.2 Setting up the browser's driver

In [253]:
# connect the webdriver to our notebook
chrome_path = "/Users/balapan/Downloads/chromedriver"
driver = webdriver.Chrome(executable_path = chrome_path)

# get and open the needed webpage 
driver.get("https://www.sephora.com/")

## 0.3 Navigation to the targeted page - All Skincare 

In [254]:
# sephora's website layout of the navigation bar changes depending on the size of the window
try:
    # if in the full screen mode
    skincare = WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[id = 'top_nav_drop_3_trigger']"))).click()
    all_skincare = driver.find_element_by_xpath('//*[@id="top_nav_drop_3_trigger"]')
    all_skincare.click()
except:
    # if the screen is small
    time.sleep(5)
    xpath = "/html/body/div[1]/div[1]/header/div[3]/div/a[4]"
    all_skincare = driver.find_element_by_xpath(xpath)
    all_skincare.click()
# finally:
#     all_skincare.click()

In [255]:
# close the sign in page by clicking on the cross sign
cross = WebDriverWait(driver,10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label = 'Continue shopping']"))).click()


## 0.4 Getting necessary subpages 

In [256]:
# choose only the skincare categories that we are interested in 
tickers = ['Moisturizers', 'Cleansers', 'Treatments', 'Eye Care', 'Masks', 'Sun Care', 'Lip Treatments']

# for testing purposes, shorten the list
# tickers = tickers[:1]
tickers

['Moisturizers',
 'Cleansers',
 'Treatments',
 'Eye Care',
 'Masks',
 'Sun Care',
 'Lip Treatments']

In [257]:
# all categories we are interested in are in the same css class
# use the class name to locate and save the class objects
category_class = "css-10wlsyd.e65zztl0"
class_objects = driver.find_elements_by_class_name(category_class)

# check that we collected the right headers
class_text = [c.text for c in class_objects]
class_text

['Just Arrived',
 'Value & Gift Sets',
 'Mini Size',
 'Clean Skincare',
 'Shop by Concern',
 'Moisturizers',
 'Cleansers',
 'Treatments',
 'Wellness',
 'Eye Care',
 'Masks',
 'High Tech Tools',
 'Sun Care',
 'Self Tanners',
 'Lip Treatments',
 'Vegan Skincare']

In [258]:
# only choose the ones that we are interested in 
categories = [x for x in class_objects if x.text in tickers]
links = []

# check their names and compare to the tickers list
for m in categories:
    print(m.text)

Moisturizers
Cleansers
Treatments
Eye Care
Masks
Sun Care
Lip Treatments


# 1. Website Scraping

 ## category by category, product by product

In [261]:
# показ уведомлений хрома сбивает работу

In [262]:
categories[0].click()

In [263]:
# collect some products on the page
image_class = "css-1rovmyu.e65zztl0"

# products = [x for x in products if x.text in tickers]
# links = []

products = driver.find_elements_by_class_name(image_class)
products

[<selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="aace59de-3f79-43fe-801d-832fc3891bee")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="eadd0205-f45c-4c48-871b-126718660fd1")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="f22eb762-75ca-436e-8023-816b19dd5f4e")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="bde82ecb-9b59-41b7-8c31-b7111c0393b3")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="f6dc1adc-348c-4944-8ef7-006aa30296ef")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="9b724662-17b2-4143-bbc1-801a30bc158d")>,
 <selenium.webdriver.remote.webelement.WebElement (session="58002439136069ec1ca0844434522b45", element="4dee4476-ad15-46fa-8d44-04

In [264]:
len(products)

12

In [265]:
products[0].click()

In [266]:
# get name, price and number of loves for a product
# css_name = 'span.css-57kn72'
# css_price = 'span[data-at="price"]'
# css_loves = 'span[data-at="product_love_count"]'

css_name  = 'span.css-1pgnl76.css-1pgnl76'
css_brand = 'a.css-nc375s.e65zztl0'
css_price = 'span.css-1lzahen'
css_loves = 'span.css-jk94q9'

name  = driver.find_element_by_css_selector(css_name).text
brand = driver.find_element_by_css_selector(css_brand).text
price = driver.find_element_by_css_selector(css_price).text
loves = driver.find_element_by_css_selector(css_loves).text

price = parse_number(price)
loves = parse_number(loves)

print(name, brand, price, loves)

Protini™ Polypeptide Moisturizer Drunk Elephant 68.00 307


In [267]:
item = []

item.append(Product(name, brand, price, loves=loves))

In [268]:
# get the overall rating/score 
css_rating = '//*[@id="ratings-reviews-container"]/div[2]/div[2]/div[1]/div/div[2]/div/span'
total_score = float(driver.find_element_by_xpath(css_rating).text)

# get total number of reviews 
css_review_count = '//*[@id="ratings-reviews-container"]/div[2]/div[2]/div[1]/div/div[2]/span'
total_reviews_count = driver.find_element_by_xpath(css_review_count).text
total_reviews_count = extract_number(total_reviews_count)


print(total_score, total_reviews_count)
item[0].set_rating(total_score)
item[0].set_reviews_num(total_reviews_count)

4.0 5246.0


In [269]:
# get highlights - info from the good for section
good_for = []
css_highlights = '/html/body/div[1]/div[2]/div/main/div/div[3]'
highlights = driver.find_element_by_xpath(css_highlights).text

good_for = extract_highlights(highlights)
good_for

item[0].set_concerns(good_for)
item[0].get_concerns()

[' Loss of firmness', ' Dullness/Uneven Texture', ' Anti-Aging', ' Dryness']

In [270]:
css_about_product = '/html/body/div[1]/div[2]/div/main/div/div[5]/div[2]/div'
about_product = driver.find_element_by_xpath(css_about_product).text

skin_types = extract_skin_type(about_product)
skin_types

item[0].set_skin_type(skin_types)
item[0].get_skin_type()

['Normal', 'Dry', 'Combination', 'Oily']

In [274]:
vegan = False
cruelty_free = False
recyclable_packaging = False
no_parabens = False

# for some reason need to assign about_product again as it changes 
css_about_product = '/html/body/div[1]/div[2]/div/main/div/div[5]/div[2]/div'
about_product = driver.find_element_by_xpath(css_about_product).text

eco_info = extract_eco_info(about_product)

item[0].set_eco_info(eco_info)
item[0].get_eco_info()

{'vegan': True,
 'cruelty_free': True,
 'recyclable_packaging': False,
 'no_parabens': False}

In [275]:
highlighted_ingredients = []
all_ingredients = []

all_ingredients, highlighted_ingredients  = get_ingredients()

item[0].set_ingredients(all_ingredients)
item[0].set_ingredients(highlighted_ingredients, all_of_them = False)

item[0].get_ingredients(all_of_them = True)
item[0].get_ingredients(all_of_them = False)

Water/Aqua/Eau, Dicaprylyl Carbonate, Glycerin, Cetearyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Sclerocarya Birrea Seed Oil, Bacillus/Soybean/ Folic Acid Ferment Extract, Nymphaea Alba Root Extract, sh-Oligopeptide-1, sh-Oligopeptide-2, sh-Polypeptide-1, sh-Polypeptide-9, sh-Polypeptide-11, Copper Palmitoyl Heptapeptide-14, Heptapeptide-15 Palmitate, Palmitoyl Tetrapeptide-7, Palmitoyl Tripeptide-1, Alanine, Arginine, Glycine, Histidine, Isoleucine, Phenylalanine, Proline, Serine, Threonine, Valine, Acetyl Glutamine, Coconut Alkanes, Coco-Caprylate/Caprate, Sodium Hyaluronate, Aspartic Acid, Linoleic Acid, Linolenic Acid, Lecithin, Butylene Glycol, Polyvinyl Alcohol, Sodium Lactate, Sodium PCA, PCA, Sorbitan Isostearate, Carbomer, Polysorbate 20, Polysorbate 60, Lactic Acid/Glycolic Acid Copolymer, Hydroxyethyl Acrylate/Sodium Acryloyldimethyl Taurate Copolymer, Xanthan Gum, Isomalt, 1,2-Hexanediol, Caprylyl Glycol, Chlorphenesin, Phenoxyethanol, Tocopherol, Sodium Benzoate, Phen

['Signal Peptide Complex (Growth Factors)',
 'Pygmy Waterlily Stem Cell Extract',
 'Soybean Folic Acid Ferment Extract']

the usage_schedule is not a reliable factor
use_daily is difficult to assign as the word daily is often omitted 

In [276]:
use_daily = False   
usage = {
    'morning': False,
    'night': False
}

use_daily, usage = get_usage_schedule()

item[0].set_usage(usage)
item[0].get_usage()

{'morning': True, 'night': True}

In [278]:
pprint(vars(item[0]))

{'_Product__all_ingredients': ['Water/Aqua/Eau',
                               'Dicaprylyl Carbonate',
                               'Glycerin',
                               'Cetearyl Alcohol',
                               'Cetearyl Olivate',
                               'Sorbitan Olivate',
                               'Sclerocarya Birrea Seed Oil',
                               'Bacillus/Soybean/ Folic Acid Ferment Extract',
                               'Nymphaea Alba Root Extract',
                               'sh-Oligopeptide-1',
                               'sh-Oligopeptide-2',
                               'sh-Polypeptide-1',
                               'sh-Polypeptide-9',
                               'sh-Polypeptide-11',
                               'Copper Palmitoyl Heptapeptide-14',
                               'Heptapeptide-15 Palmitate',
                               'Palmitoyl Tetrapeptide-7',
                               'Palmitoyl Tripeptide-1