# Data scraping Ulta

## Setup

In [1]:
# import modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
#from http_request_randomizer.requests.proxy.requestProxy import RequestProxy
import time 
import pandas as pd
import numpy as np
import rootpath
import math
import random as rd
import dill  

# Set root path for project
# (used below as the base directory for the chromedriver binary and the data/ output folder)
path = rootpath.detect()

## Moisturizers
### pg 4

In [2]:
# Launch a Chrome session using the chromedriver binary located under the project root path.
# NOTE(review): the next cell starts its own session and rebinds `driver`,
# so this launch appears redundant — confirm before removing.
driver = webdriver.Chrome(f"{path}/chromedriver")

In [108]:
# Type of product and skin
prod_type = 'face moisturizer'
prod_skin = 'normal'
prod_cat = 'moisturizer'
page_num = 4

# Start up chrome driver
# Quit any session left open by an earlier cell or an earlier run of this cell first:
# rebinding `driver` without quitting would orphan that Chrome process.
if "driver" in globals():
    driver.quit()
driver = webdriver.Chrome(f"{path}/chromedriver")

# Fetch product listing page `page_num`
# The offset is derived from page_num so the URL and the page label cannot drift apart.
# Assumes `No` is the zero-based record offset and `Nrpp` the records per page
# (page 4 -> No=288 with Nrpp=96) — TODO confirm against the site if the page size changes.
results_per_page = 96
listing_url = (
    "https://www.ulta.com/skin-care-face-moisturizer"
    f"?N=27h9Z1z13p3l&No={(page_num - 1) * results_per_page}&Nrpp={results_per_page}"
)
driver.get(listing_url)

# Extract all html link references for webpage
# Wait 5 seconds for page to load before extracting them
time.sleep(5)
# find_elements(By.XPATH, ...) is used because the find_elements_by_* helpers
# no longer exist in current Selenium releases; this form works on old and new versions.
item_list = driver.find_elements(
    By.XPATH,
    "/html/body/div[1]/div[6]/div[2]/div[2]/div[6]/div/div/ul//div[contains(@class, 'productQvContainer')]/a[@href]",
)
# Note from //div[contains] is the wildcard part that selects each individual product link

# Convert selenium refs info to href links and store them in a list
product_links = [link.get_attribute('href') for link in item_list]

# Shuffle links so go through them in a random order
rd.shuffle(product_links)

# Create empty lists to store results in for each html element
brand_names = []
prod_names = []
prod_sizes = []
prod_prices = []
prod_details = []
prod_ingredientlists = []
prod_ratings = []
prod_respondrecs = []
prod_reviewtotals = []

In [109]:
# Iterate over links from webpage of products to extract text data


def _first_text(driver, xpath, attribute=None):
    """Return the text (or `attribute`) of the first element matching `xpath`.

    Parameters
    ----------
    driver : selenium WebDriver used to query the current page.
    xpath : XPath expression to look up.
    attribute : optional attribute name; when given, its value is returned
        instead of the element's visible text.

    Returns
    -------
    The text/attribute value, or math.nan when nothing matches, so that one
    product page with a missing field does not abort the whole scraping run.
    """
    matches = driver.find_elements(By.XPATH, xpath)
    if not matches:
        return math.nan
    first = matches[0]
    return first.text if attribute is None else first.get_attribute(attribute)


def _awaited_text(driver, xpath, timeout=45):
    """Wait up to `timeout` seconds for an element matching `xpath` and return its text.

    Returns math.nan if the element never appears within the timeout
    (e.g. new products that don't have reviews yet).
    """
    try:
        # until() hands back the located element, so no second lookup is needed
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
    except (NoSuchElementException, TimeoutException):
        return math.nan
    return element.text


for link in product_links:
    # wait between 1 and 5 seconds before going to next link
    time.sleep(rd.randint(1, 5))
    driver.get(link)
    # wait 5 seconds before scraping elements of webpage
    # e.g. allow it to load
    time.sleep(5)

    # Brand name
    brand_names.append(_first_text(
        driver,
        "//*[@id='js-mobileBody']/div/div/div/div/div/div/section[1]/div[2]/div/h1/div[1]/a[@class]",
    ))
    # Product name
    prod_names.append(_first_text(
        driver,
        "/html/body/div[1]/div[4]/div/div/div/div/div/div/section[1]/div[2]/div/h1/div[2]",
    ))
    # Product size
    # The same element is read whether the product has one size or several variants.
    # (All variant sizes could instead be read from the element whose class
    # contains 'ProductDetail__productVariantOptions' if this proves insufficient.)
    prod_sizes.append(_first_text(
        driver,
        "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'ProductMainSection__itemNumber', ' ' ))]",
    ))
    # Product price
    prod_prices.append(_first_text(
        driver,
        "/html/body/div[1]/div[4]/div/div/div/div/div/div/section[1]/div[2]/div/div[contains(@class, 'ProductPricingPanel')]",
    ))
    # Product details
    prod_details.append(_first_text(
        driver,
        "/html/body/div[1]/div[4]/div/div/div/div/div/div/section[2]/div/div[1]/div/div",
    ))
    # Product ingredients
    # innerText is read instead of .text so the content is captured even when not displayed
    prod_ingredientlists.append(_first_text(
        driver,
        "/html/body/div[1]/div[4]/div/div/div/div/div/div/section[2]/div/div[3]/div[2]/div[2]/div/div/div",
        attribute="innerText",
    ))

    # For the review elements below, wait explicitly so they have time to load;
    # products without reviews fall back to NaN after the timeout.
    # Product average rating
    prod_ratings.append(_awaited_text(
        driver,
        "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'pr-snippet-rating-decimal', ' ' ))] | /html/body/div[1]/div[4]/div/div/div/div/div/div/section[5]/div/div[2]/div[3]/div/section/header/section/div/div[1]/div/div[1]/div/div[2]",
    ))
    # Product proportion of respondents who would recommend product to friends
    prod_respondrecs.append(_awaited_text(
        driver,
        "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'pr-reco-value', ' ' ))]",
    ))
    # Product total number of reviews
    prod_reviewtotals.append(_awaited_text(
        driver,
        "//*[contains(concat( ' ', @class, ' ' ), concat( ' ', 'pr-snippet-review-count', ' ' ))]",
    ))

In [106]:
# Build the constant-valued columns ('use_category', 'use_subcategory',
# 'skintype', page): one identical entry per scraped product
n_products = len(prod_names)

use_categories = [f"{prod_cat}"] * n_products

use_subcategory = [f"{prod_type}"] * n_products

skintype = [f"{prod_skin}"] * n_products

# Now using all pages combined into one so it will be the same for all
# (page number is stored as a string)
page = [f"{page_num}"] * n_products

In [110]:
# Combine product info lists into dataframe and export as CSV for pandas processing
df_columns = ['use_category', 'use_subcategory', 'skintype',
              'brand', 'product', 'size', 'price', 'details',
              'ingredients', 'ratings', 'perc_respondrec', 'total_reviews', 'link', 'page']

# One tuple per product, pairing the scraped fields with their category labels.
# Every entry is used: the result lists start out empty, so there is no
# placeholder element to skip at index 0.
# zip() stops at the shortest list, so rows stay aligned even if a scraping
# run ended early and the lists have different lengths.
data_tuples = list(zip(use_categories, use_subcategory,
                       skintype, brand_names, prod_names,
                       prod_sizes, prod_prices, prod_details,
                       prod_ingredientlists, prod_ratings,
                       prod_respondrecs, prod_reviewtotals,
                       product_links, page))

# Build the dataframe in a single step (one row per tuple).
# DataFrame.append is avoided because it is not available in pandas >= 2.0.
df_pg1 = pd.DataFrame(data_tuples, columns=df_columns)

# Export to csv
df_pg1.to_csv(f"{path}/data/{prod_cat}_{prod_type}_{prod_skin}skin_pg{page_num}.csv")