# Capstone: Scraping Sephora Reviews

### Problem Statement

How should sephora group its customer base by its reviews? How do negative reviews impact whether or not an itme goes on sale

#### Collect Data

In [242]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
import json

In [243]:
url = 'https://www.sephora.com/shop/makeup-cosmetics'
res = requests.get(url)
res.status_code

200

In [244]:
res.content
soup = BeautifulSoup(res.content, 'lxml')

### Get top level category

In [245]:
categories = soup.find('nav',{'class':"css-1ofufz3"})

In [246]:
top_level_category = categories.find_all('a', {'data-at':'top_level_category'})

In [247]:
top_level_category[0]['href']

'/best-selling-makeup'

In [249]:
# Grab the links to each of the top level categories on the makeup-cosmetics page 
top_level_hrefs = []
for row in top_level_category:
    top_level_hrefs.append(row['href'])

In [251]:
# Specifically looking for just makeup products
top_level_hrefs[6:10]

['/shop/face-makeup',
 '/shop/eye-makeup',
 '/shop/lips-makeup',
 '/shop/cheek-makeup']

### Get each subcategory

In [254]:
def get_nth_hrefs(top_level_hrefs):
    nth_level_hrefs = []
    for link in top_level_hrefs:
        url = 'https://www.sephora.com' + link
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        nth_level = soup.find_all('a',{'data-at':'nth_level'})
        for nth_link in nth_level:
            nth_level_hrefs.append(nth_link['href'])
        sleep(2)
    return nth_level_hrefs   

Focus specifically on just makeup products so return only makeup products subcategories

In [255]:
makeup_nth_hrefs = get_nth_hrefs(top_level_hrefs[6:10])

In [679]:
# Number of subcategories
len(makeup_nth_hrefs)

30

### Get Product Data

In [None]:
# Link format to see full list of products
# Must include ?pageSize=300 to view all items on a page
https://www.sephora.com/shop/foundation-makeup?pageSize=300

In [434]:
def get_data(hrefs):
    datas = []
    for href in hrefs:
        url = 'https://www.sephora.com' + href + '?pageSize=300'
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        prod_dicts = json.loads(soup.find_all('script', {'id':'linkJSON'})[0].text)[2]["props"]["products"]
        for dic in prod_dicts:
            d = {}
            d['category'] = href
            d['brand_name'] = dic['brandName']
            test = dic['currentSku']
            for k,v in test.items():
                d[k] = v
            datas.append(d)
        print(f'{href} done')
        sleep(3)
    return datas        

In [435]:
full_product_data = get_data(makeup_nth_hrefs)

/shop/foundation-makeup done
/shop/bb-cc-cream-face-makeup done
/shop/tinted-moisturizer done
/shop/concealer done
/shop/makeup-primer-face-primer done
/shop/setting-powder-face-powder done
/shop/luminizer-luminous-makeup done
/shop/contour-palette-brush done
/shop/color-correcting done
/shop/complexion-sets done
/shop/eyeshadow-palettes done
/shop/mascara done
/shop/eyeliner done
/shop/eyebrow-makeup-pencils done
/shop/eyeshadow done
/shop/eyeshadow-primer-eye-primer done
/shop/under-eye-concealer done
/shop/fake-eyelashes-false-eyelashes done
/shop/eye-sets done
/shop/lipstick done
/shop/lip-gloss done
/shop/liquid-lipstick done
/shop/lip-balm-treatments-lips-makeup done
/shop/lip-stain done
/shop/lip-plumper done
/shop/lip-liner-lip-pencils done
/shop/lip-palettes-gloss-sets done
/shop/blush done
/shop/bronzer-makeup done
/shop/cheek-palettes done


In [436]:
df_products = pd.DataFrame(full_product_data)

In [438]:
df_products.to_csv('df_products_final_1.csv')

### Get Page Number for Reviews

We need to extract the total number of pages for reviews in order to scrape each page for the reviews. 

In [445]:
def get_page_nums(product_id):
    
    # Create soup 
    url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    
    # Finding the maximum page number 
    soup.find_all('span', {'class':'BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber'})
    pages = soup.find_all('div', {'class':'BVRRPager BVRRPageBasedPager'})
    length = len(str(pages).split('target="__TARGETFRAME__" title="'))
    
    # Return max page number 
    page_num = str(pages).split('target="__TARGETFRAME__" title="')[length-2].split('"')[0]
    return page_num

In [450]:
# Test
get_page_nums(df_products['skuId'][10])

'242'

In [453]:
# Extract total number of review pages for product 
count = 0 
df_products['review_count'] = None
for i, p in enumerate(df_products['skuId']):
    df_products['review_count'][i] = get_page_nums(p)
    count += 1
    if count % 300 == 0:
        print(f'{count} pages retrieved out of {len(df_products["skuId"])}')
    sleep(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


300 pages retrieved out of 3534
600 pages retrieved out of 3534
900 pages retrieved out of 3534
1200 pages retrieved out of 3534
1500 pages retrieved out of 3534
1800 pages retrieved out of 3534
2100 pages retrieved out of 3534
2400 pages retrieved out of 3534
2700 pages retrieved out of 3534
3000 pages retrieved out of 3534
3300 pages retrieved out of 3534


In [456]:
df_products.to_csv('df_review_pages_incl.csv')

In [457]:
# Function returns empty list of no max page number (for reviews with only 1 page or no reviews yet)
# Input 1 for those cases
df_products['review_count'] = df_products['review_count'].map(lambda x: 1 if x == '[]' else x)

In [460]:
# input the product dataframe with all product_ids and count of pages 
def get_reviews(product_ids, pages):
    reviews = []
    for i, product_id in enumerate(product_ids):
        pages_1 = pages[i]
        
        for page in list(range(int(pages_1))):
            if page <= 5:
                url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page={page}')
                res = requests.get(url)
                res.content
                soup = BeautifulSoup(res.content, 'lxml')
                base_data_reviews = soup.find_all('span', {'itemprop': 'review'})

                
                
                for review in base_data_reviews:
                    f_review = ""
                    for row in review.find_all('span', {'class': 'BVRRReviewText'}):
                        f_review += row.text
                    info = {}
                    info['product_id'] = product_id
                    info['review'] = f_review
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}) is not None:
                        info['eye_color'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}).text
                    else:
                        info['eye_color'] = None
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}) is not None:
                        info['skin_tone'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}).text
                    else:
                        info['skin_tone'] = None
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}) is not None:
                        info['skin_type'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}).text
                    else:
                        info['skin_type'] = None
                        
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}) is not None:
                        info['age_range'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}).text
                    else:
                        info['age_range'] = None
                        
                    if review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}) is not None:
                        info['rating'] = review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}).text
                    else:
                        info['rating'] = None
                    
                    if review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}) is not None:
                        info['title'] = review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}).text
                    else:
                        info['title'] = None
                        
                    info['user'] = review.find('span', {'class': 'BVRRNickname'}).text
                    reviews.append(info)
                sleep(1)            
    return reviews

In [462]:
reviews_data = get_reviews(df_products['skuId'], df_products['review_count'])

In [463]:
reviews_df_1 = pd.DataFrame(reviews_data)

In [468]:
reviews_df_1.to_csv('reviews_data_final.csv')

For each product, there is a max of 30 reviews. There are 6 reviews on each page. 

### Get Summaries for Products

Each product also includes keywords that are used most in the reviews. This will be a good feature to include in the model

In [476]:
summaries = []
count = 0 
for product_id in df_products['skuId']:
    url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    quick_take = soup.find_all('td', {'id': 'BVRRQuickTakeProTagsID'})
    count +=1
    try:
        quick_list = quick_take[0].find_all('a')
        quick_count = quick_take[0].find_all('span')

        for i, row in enumerate(quick_list[:-1]):
            summary = {}
            summary['key_word'] = quick_list[i].text
            summary['key_word_count'] = quick_count[i].text
            summary['product_id'] = product_id
            summaries.append(summary)
    except:
        pass
    if count % 300 == 0:
        print(f'{count} out of {len(df_products["skuId"])}')
    sleep(1)

In [17]:
summaries_df = pd.DataFrame(summaries)

In [19]:
summaries_df.to_csv('summaries_final.csv')

### Get aggregate ratings

Each product also has aggregate ratings laid out as well. There is a count for the number of 1-5 star ratings. Not everyone writes a review so this rating is also useful to have as a feature.

In [475]:
agg_rating = []
count_track = 0
for p in df_products['skuId']:
    aggs = {}
    url = (f'http://reviews.sephora.com/8723abredes/{p}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    rating = soup.find_all('div', {'class': 'BVRRRatingSummaryColumn'})
    count_track +=1
    try:
        avg = rating[0].find('span', {'itemprop':'ratingValue'}).text
        count = rating[0].find('span', {'class':'BVRRCount BVRRNonZeroCount'}).text

        ind_star = rating[0].find('div', {'class':'BVRRHistogramContent'}).find_all('span')

        aggs[ind_star[1].text] = ind_star[2].text
        aggs[ind_star[4].text] = ind_star[5].text
        aggs[ind_star[7].text] = ind_star[8].text
        aggs[ind_star[10].text] = ind_star[11].text
        aggs[ind_star[13].text] = ind_star[14].text
        aggs['avg_rating'] = avg
        aggs['review_count'] = count
        aggs['product_id'] = p
        agg_rating.append(aggs)
    except:
        pass
    if count_track % 300 == 0:
        print(f'{count_track} out of {len(df_products["skuId"])}')
    
    sleep(1)

In [224]:
agg_rating_df = pd.DataFrame(agg_rating)

In [226]:
agg_rating_df.to_csv('agg_rating.csv')

In total, there are 4 datasets. The product information dataset, the reviews dataset, the summary (review keywords) data set and aggregate ratings dataset.