# Capstone: Scraping Sephora Reviews

### Problem Statement

How should sephora group its customer base by its reviews? How do negative reviews impact whether or not an itme goes on sale

#### Collect Data

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from time import sleep

In [3]:
url = 'https://www.sephora.com/shop/makeup-cosmetics'
res = requests.get(url)
res.status_code

200

In [4]:
res.content
soup = BeautifulSoup(res.content, 'lxml')

### Get top level category

In [5]:
categories = soup.find('nav',{'class':"css-1ofufz3"})

In [6]:
top_level_category = categories.find_all('a', {'data-at':'top_level_category'})

In [7]:
top_level_category[0]['href']

'/best-selling-makeup'

In [8]:
# Grab the links to each of the top level categories on the makeup-cosmetics page 
top_level_hrefs = []
for row in top_level_category:
    top_level_hrefs.append(row['href'])

In [641]:
top_level_hrefs[:5]

['/best-selling-makeup',
 '/new-makeup',
 '/shop/makeup-kits-makeup-sets',
 '/shop/mini-makeup',
 '/shop/makeup-palettes']

In [676]:
# Specifically looking for just makeup products
top_level_hrefs[6:10]

['/shop/face-makeup',
 '/shop/eye-makeup',
 '/shop/lips-makeup',
 '/shop/cheek-makeup']

### Get each subcategory

In [637]:
def get_nth_hrefs(top_level_hrefs):
    nth_level_hrefs = []
    for link in top_level_hrefs:
        url = 'https://www.sephora.com' + link
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        nth_level = soup.find_all('a',{'data-at':'nth_level'})
        for nth_link in nth_level:
            nth_level_hrefs.append(nth_link['href'])
    return nth_level_hrefs   

In [638]:
nth_hrefs = get_nth_hrefs(top_level_hrefs)

In [640]:
nth_hrefs[:5]

['/shop/foundation-makeup',
 '/shop/bb-cc-cream-face-makeup',
 '/shop/tinted-moisturizer',
 '/shop/concealer',
 '/shop/makeup-primer-face-primer']

In [677]:
makeup_nth_hrefs = get_nth_hrefs(top_level_hrefs[6:10])

In [679]:
len(makeup_nth_hrefs)

30

### Get each product link

In [409]:
url = 'https://www.sephora.com/shop/foundation-makeup'
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')

In [412]:
product_grid = soup.find_all('script', {'type':"application/ld+json"})

In [18]:
product_grid[1].text.split('Product')[1][3:-11]

'name":"Pro Filt\'r Hydrating Longwear Foundation","category":"Foundation","@context":"http://schema.org","brand":"FENTY BEAUTY by Rihanna","url":"https://www.sephora.com/product/pro-filt-r-hydrating-longwear-foundation-P448702"},{"offers":{"seller":{"@type":"Organization","name":"Sephora"},"priceCurrency":"USD","@type":"Offer","price":"68.00","availability":"http://schema.org/InStock","sku":"2257111"},"image":"https://www.sephora.com/productimages/sku/s2257111-main-grid.jpg'

In [20]:
ex_list = product_grid[1].text.split('Product')[-1].split(',')

In [21]:
ex_list

['"',
 '"name":"Stay-Matte Sheer Pressed Powder"',
 '"category":"Foundation"',
 '"@context":"http://schema.org"',
 '"brand":"CLINIQUE"',
 '"url":"https://www.sephora.com/product/stay-matte-sheer-pressed-powder-P122748"}]']

In [22]:
[i for i in ex_list if 'url' in i]

['"url":"https://www.sephora.com/product/stay-matte-sheer-pressed-powder-P122748"}]']

In [23]:
example = product_grid[1].text.split("Product")[1]

In [24]:
example.split(',')[1:-1][4].split(":", 1)[1][1:-2]

'https://www.sephora.com/product/pro-filt-r-hydrating-longwear-foundation-P448702'

In [504]:
product_data = []
for row in product_grid[1].text.split('"priceCurrency":"USD"'):
    product_data.append(row.split(','))


In [605]:
product_data[2].remove('"name":"Sephora"}')

In [614]:
product_data[6]

['',
 '"lowPrice":"42.00"',
 '"@type":"AggregateOffer"',
 '"highPrice":"64.00"}',
 '"image":"https://www.sephora.com/productimages/sku/s2079168-main-grid.jpg"',
 '"@type":"Product"',
 '"name":"Luminous Silk Foundation"',
 '"category":"Foundation"',
 '"@context":"http://schema.org"',
 '"brand":"Giorgio Armani Beauty"',
 '"url":"https://www.sephora.com/product/luminous-silk-foundation-P393401"}',
 '{"offers":{']

In [506]:
product_url = []
for product in product_data[1:]:
    for row in product:
        if 'url' in row:
            product_url.append(re.sub('["]', "", row.split(':',1)[1][1:-2]))

In [507]:
product_url[:5]

['https://www.sephora.com/product/pro-filt-r-hydrating-longwear-foundation-P448702',
 'https://www.sephora.com/product/skin-fetish-sublime-perfection-foundation-P447519',
 'https://www.sephora.com/product/pro-filtr-soft-matte-longwear-foundation-P87985432',
 'https://www.sephora.com/product/flex-foundation-stick-P448151',
 'https://www.sephora.com/product/double-wear-stay-in-place-makeup-P378284']

In [508]:
def get_product_url(url):
    url = url
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    product_grid = soup.find_all('script', {'type':"application/ld+json"})
    
    product_data = []
    for row in product_grid[1].text.split('"priceCurrency":"USD"'):
        product_data.append(row.split(','))
            
    product_url = []
    for product in product_data[1:]:
        for row in product:
            if 'url' in row:
                product_url.append(re.sub('["]', "", row.split(':',1)[1][1:-2]))
    return product_url

In [834]:
# Returns product id given the individual product url
def product_id(url_list):
    product_ids = []
    count = 0
    for i, url in enumerate(url_list):
        url = url
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        div = soup.find('div', {'class':'css-1owflha'})
        try:
            product_ids.append(div.contents[1].text.split('ITEM')[-1].strip())
            count += 1
        except:
            pass
        if count % 10 == 0:
            print(f"Product_id {count} retrieved")
    return product_ids

In [835]:
# Takes in each sub level category as url
def make_product_df(url):
    product_df_data = []
    url = url
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    product_grid = soup.find_all('script', {'type':"application/ld+json"})
    
    # Split data on priceCurrency 
    product_data = []
    for row in product_grid[1].text.split('"priceCurrency":"USD"'):
        product_data.append(row.split(','))
    
    # Remove name:Sephora to return product name 
    for product in product_data:
        if '"name":"Sephora"}' in product:
            product.remove('"name":"Sephora"}')
     
    # Product information 
    for product in product_data[1:]:
        products = {}    
        for row in product:
            if 'url' in row:
                products['url'] = re.sub('["]', "", row.split(':',1)[1][1:-2])
                
            if '"price"' in row:
                products['price'] = re.sub('["}]', "", row.split(":", 1)[1])
                
            if '"lowPrice"' in row:
                products['sale_price'] = re.sub('["}]', "", row.split(":", 1)[1])
                
            if '"highPrice"' in row:
                products['price'] = re.sub('["}]', "", row.split(":", 1)[1])

            if 'category' in row:
                products['category'] = re.sub('["}]', "", row.split(":", 1)[1])
                
            if 'brand' in row:
                products['brand'] = re.sub('["}]', "", row.split(":", 1)[1])
            
            if '"name"' in row:
                products['name'] = re.sub('["}]', "", row.split(":", 1)[1]) 
                
        product_df_data.append(products)
   
    product_df = pd.DataFrame(product_df_data)
    
    # Returns product_id from individual product url 
    try: 
        product_df['product_id'] = product_id(product_df['url'])
    except:
        pass
    return product_df

In [683]:
# Pop in nth_level_hrefs 
def make_final_product_df(url_list):
    for short_url in url_list:
        long_url = 'https://www.sephora.com'+ short_url
        make_product_df(long_url)

In [838]:
# make_final_product_df(nth_hrefs)

In [646]:
nth_hrefs[:3]

['/shop/foundation-makeup',
 '/shop/bb-cc-cream-face-makeup',
 '/shop/tinted-moisturizer']

In [665]:
len(nth_hrefs)

246

In [837]:
makeup_nth_hrefs[0]

'/shop/foundation-makeup'

In [841]:
product_df_data = []
url = 'https://www.sephora/com/shop/bb-cc-cream-face-makeup'
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')
product_grid = soup.find_all('script', {'type':"application/ld+json"})

ConnectionError: HTTPSConnectionPool(host='www.sephora', port=443): Max retries exceeded with url: /com/shop/bb-cc-cream-face-makeup (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x1a33e7cd30>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [829]:
dfs = []
for link in makeup_nth_hrefs:
    url = 'https://www.sephora.com' + link
    df = make_product_df(url)
    dfs.append(df)
    time.sleep(5)

KeyError: 'url_1'

In [686]:
# Like 30?
dfs_test = pd.concat(dfs)

In [688]:
dfs_test.reset_index(inplace = True, drop= True)

In [821]:
dfs_test.shape

(1387, 7)

In [823]:
dfs_test

Unnamed: 0,brand,category,name,price,sale_price,url,product_id
0,FENTY BEAUTY by Rihanna,Foundation,Pro Filt'r Hydrating Longwear Foundation,35.00,,https://www.sephora.com/product/pro-filt-r-hyd...,2268274
1,PAT McGRATH LABS,Foundation,Skin Fetish: Sublime Perfection Foundation,68.00,,https://www.sephora.com/product/skin-fetish-su...,2257111
2,FENTY BEAUTY by Rihanna,Foundation,Pro Filt'r Soft Matte Longwear Foundation,35.00,,https://www.sephora.com/product/pro-filtr-soft...,2164671
3,MILK MAKEUP,Foundation,Flex Foundation Stick,36.00,,https://www.sephora.com/product/flex-foundatio...,2242105
4,Estée Lauder,Foundation,Double Wear Stay-in-Place Foundation,43.00,,https://www.sephora.com/product/double-wear-st...,2112167
5,Giorgio Armani Beauty,Foundation,Luminous Silk Foundation,64.00,42.00,https://www.sephora.com/product/luminous-silk-...,2079168
6,Laura Mercier,Foundation,Tinted Moisturizer Natural Skin Perfector Broa...,47.00,24.00,https://www.sephora.com/product/tinted-moistur...,2250603
7,Smashbox,Foundation,Studio Skin Full Coverage 24 Hour Foundation,36.00,,https://www.sephora.com/product/studio-skin-fu...,2251874
8,MAKE UP FOR EVER,Foundation,Ultra HD Invisible Cover Foundation,43.00,,https://www.sephora.com/product/ultra-hd-invis...,2246726
9,IT Cosmetics,Foundation,CC+ Cream with SPF 50+,39.00,15.00,https://www.sephora.com/product/your-skin-but-...,1868165


In [689]:
dfs_test.to_csv('dfs_test.csv')

### Get Product ID

In [241]:
url = 'https://www.sephora.com/product/pro-filt-r-hydrating-longwear-foundation-P448702'
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')

In [325]:
div = soup.find('div', {'class':'css-1owflha'})

In [276]:
type(div[0])

bs4.element.Tag

In [327]:
div.contents[0].text

"FENTY BEAUTY by RihannaPro Filt'r Hydrating Longwear Foundation"

In [316]:
div[0].text

"FENTY BEAUTY by RihannaPro Filt'r Hydrating Longwear FoundationSIZE 1.08 oz/ 32 mL•ITEM 22682745 reviews6213 lovesexclusive·online only"

In [329]:
div.contents[1].text.split("•")

['SIZE 1.08 oz/ 32 mL', 'ITEM 2268274']

In [39]:
def product_id(url_list):
    product_ids = []
    for i, url in enumerate(url_list):
        url = url
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        div = soup.find('div', {'class':'css-1owflha'})
        product_ids.append(div.contents[1].text.split('ITEM')[-1].strip())
        print(f"Product_id {i} retrieved")
    return product_ids

In [388]:
url = 'https://www.sephora.com/product/stay-matte-sheer-pressed-powder-P122748'
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')
div = soup.find('div', {'class':'css-1owflha'})

In [389]:
div.contents[1].text.split('ITEM')[-1].strip()

'51573'

In [347]:
product_id(product_url[:5])

Product_id 0 retrieved
Product_id 1 retrieved
Product_id 2 retrieved
Product_id 3 retrieved
Product_id 4 retrieved


['2268274', '2257111', '2164671', '2242105', '2112167']

### Extract Reviews

In [44]:
url = 'http://reviews.sephora.com/8723abredes/2268274/reviews.htm?format=embedded&page=1'
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')

In [45]:
reviews = soup.find_all('span',{'class':'BVRRReviewText'})

In [46]:
reviews[0].text

"A natural finish and at the same time gives coverage. It didnt' feel cakey at all but it looked amazing. My skin was glowing but not greasy. When the Mattying foundation launched I didnt like it as it felt dry on my skin just like how some people felt. This is my new favorite foundation."

In [50]:
reviews[1]

<span class="BVRRReviewText">I wear 290 in the Soft Matte Longwear Foundation and found that 290 in this formulation is not an exact match even though it’s supposed to. It looked a bit grey so I went with 255. Not a perfect match but definitely made it work by buffing it out well (used about a half a pump). </span>

In [359]:
soup.find_all('span', {'class':'BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber'})

[<span class="BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber">1</span>]

In [361]:
pages = soup.find_all('div', {'class':'BVRRPager BVRRPageBasedPager'})

In [372]:
# [num for num in pages[0].text.split() if isinstance(num, int)]
pages[0].text.split()

['1', '|', '2', '|', '3', '|', '...next', '|']

In [377]:
page_nums = []
for string in pages[0].text.split():
    try:
        page_nums.append(int(string))
    except:
        pass

In [378]:
page_nums

[1, 2, 3]

In [None]:
# link format = review_url = 'http://reviews.sephora.com/8723abredes/{}/reviews.htm?format=embedded&page={}'.format(
#        product_id, page)

# source: https://github.com/nanafwu/sephora-reviews-nlp/blob/master/scrape.py


## Put it all together

In [803]:
def get_page_nums(product_id):
    url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    soup.find_all('span', {'class':'BVRRPageLink BVRRPageNumber BVRRSelectedPageNumber'})
    pages = soup.find_all('div', {'class':'BVRRPager BVRRPageBasedPager'})
    length = len(str(pages).split('target="__TARGETFRAME__" title="'))
    page_num = str(pages).split('target="__TARGETFRAME__" title="')[length-2].split('"')[0]
    return page_num

In [401]:
get_reviews(product_ids[:1])

In [402]:
reviews_df = pd.DataFrame(reviews)

In [403]:
reviews_df

Unnamed: 0,age_range,eye_color,product_id,rating,review,skin_tone,skin_type,title,user
0,,Brown,2268274,5,A natural finish and at the same time gives co...,,Oily,Feels airy and fresh with natural coverage,veens
1,18-24,Brown,2268274,4,"Uh, at first I wanted to rate this a tad bit l...",Dark,Combination,Oh man...,KhrisGal
2,,Brown,2268274,4,I wear 290 in the Soft Matte Longwear Foundati...,Olive,Combination,,nichemarkit
3,35-44,Brown,2268274,5,Wow. This foundation is a drink of water that ...,Medium,Combination,Wow. A drink of water foundation.,northernrokz
4,over 54,Brown,2268274,5,first off I will say I am Always team Fenty! n...,Dark,Normal,CLAP CLAP RHI RHI,BUTTERCUP101


In [859]:
for i, p in enumerate(dfs_test['product_id']):
    dfs_test['review_count'][i] = get_page_nums(p)
    sleep(1)

In [865]:
dfs_test['review_pages'] = dfs_test['review_count'].map(lambda x: 1 if x == '[]' else x)

In [869]:
# input the product dataframe with all product_ids and count of pages 
def get_reviews(product_ids, pages):
    reviews = []
    for i, product_id in enumerate(product_ids):
        pages_1 = pages[i]
        
        for page in list(range(int(pages_1))):
            if page <= 3:
                url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page={page}')
                res = requests.get(url)
                res.content
                soup = BeautifulSoup(res.content, 'lxml')
                base_data_reviews = soup.find_all('span', {'itemprop': 'review'})

                
                
                for review in base_data_reviews:
                    f_review = ""
                    for row in review.find_all('span', {'class': 'BVRRReviewText'}):
                        f_review += row.text
                    info = {}
                    info['product_id'] = product_id
                    info['review'] = f_review
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}) is not None:
                        info['eye_color'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}).text
                    else:
                        info['eye_color'] = None
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}) is not None:
                        info['skin_tone'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}).text
                    else:
                        info['skin_tone'] = None
                    
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}) is not None:
                        info['skin_type'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}).text
                    else:
                        info['skin_type'] = None
                        
                    if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}) is not None:
                        info['age_range'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}).text
                    else:
                        info['age_range'] = None
                        
                    if review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}) is not None:
                        info['rating'] = review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}).text
                    else:
                        info['rating'] = None
                    
                    if review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}) is not None:
                        info['title'] = review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}).text
                    else:
                        info['title'] = None
                        
                    info['user'] = review.find('span', {'class': 'BVRRNickname'}).text
                    reviews.append(info)
        sleep(1)            
    return reviews

In [870]:
reviews_test = get_reviews(dfs_test['product_id'], dfs_test['review_pages'])

In [873]:
reviews_df = pd.DataFrame(reviews_test)

In [875]:
reviews_df.to_csv('reviews_data.csv')

In [2]:
df_product = pd.read_csv('dfs_test.csv')

In [3]:
df_reviews= pd.read_csv('reviews_data.csv')

In [None]:
# input the product dataframe with all product_ids and count of pages 
def get_summary(product_ids):
    summaries = []
    for p_id in product_ids:
        url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        base_data_reviews = soup.find_all('span', {'itemprop': 'review'})
                
        for review in base_data_reviews:
            f_review = ""
            for row in review.find_all('span', {'class': 'BVRRReviewText'}):
                f_review += row.text
            info = {}
            info['product_id'] = product_id
            info['review'] = f_review

            if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}) is not None:
                info['eye_color'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueeyeColor'}).text
            else:
                info['eye_color'] = None

            if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}) is not None:
                info['skin_tone'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinTone'}).text
            else:
                info['skin_tone'] = None

            if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}) is not None:
                info['skin_type'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueskinType'}).text
            else:
                info['skin_type'] = None

            if review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}) is not None:
                info['age_range'] = review.find('span', {'class':'BVRRValue BVRRContextDataValue BVRRContextDataValueage'}).text
            else:
                info['age_range'] = None

            if review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}) is not None:
                info['rating'] = review.find('span', {'class': 'BVRRNumber BVRRRatingNumber'}).text
            else:
                info['rating'] = None

            if review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}) is not None:
                info['title'] = review.find('span', {'class': 'BVRRValue BVRRReviewTitle'}).text
            else:
                info['title'] = None

            info['user'] = review.find('span', {'class': 'BVRRNickname'}).text
            reviews.append(info)
        sleep(1)            
    return reviews

In [5]:
df_product['product_id'][8]

2246726

In [131]:
def get_summary_content(product_ids):
    summaries = []
    for p in product_ids:
        url = (f'http://reviews.sephora.com/8723abredes/{p}/reviews.htm?format=embedded&page=1')
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        quick_take = soup.find_all('td', {'id': 'BVRRQuickTakeProTagsID'})
        quick_list = quick_take[0].find_all('a')
        quick_count = quick_take[0].find_all('span')
    
        for i, row in enumerate(quick_list[:-1]):
            summary = {}
            summary['key_word'] = quick_list[i].text
            summary['key_word_count'] = quick_count[i].text
            summary['product_id'] = p
            summaries.append(summary)
        sleep(1)
    return summaries
        

In [160]:
df_product['pid_string'] = df_product['product_id'].map(lambda x: str(x).strip())

In [167]:
df_product['pid_string'][15:20]

15    2169845
16    2247922
17    2224848
18    2031011
19    2070571
Name: pid_string, dtype: object

In [171]:
def get_summary(product_ids):
    summaries = []
    for product_id in product_ids:
        url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        quick_take = soup.find_all('td', {'id': 'BVRRQuickTakeProTagsID'})
        quick_list = quick_take[0].find_all('a')
        quick_count = quick_take[0].find_all('span')

        for i, row in enumerate(quick_list[:-1]):
            summary = {}
            summary['key_word'] = quick_list[i].text
            summary['key_word_count'] = quick_count[i].text
            summary['product_id'] = product_id
            summaries.append(summary)
        sleep(2)
    
    return summaries

In [186]:

summaries = []
count = 0 
for product_id in df_product['pid_string']:
    url = (f'http://reviews.sephora.com/8723abredes/{product_id}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    quick_take = soup.find_all('td', {'id': 'BVRRQuickTakeProTagsID'})
    count +=1
    try:
        quick_list = quick_take[0].find_all('a')
        quick_count = quick_take[0].find_all('span')

        for i, row in enumerate(quick_list[:-1]):
            summary = {}
            summary['key_word'] = quick_list[i].text
            summary['key_word_count'] = quick_count[i].text
            summary['product_id'] = product_id
            summaries.append(summary)
    except:
        pass
    if count % 100 == 0:
        print(count)
    sleep(1)
    
    


100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


In [188]:
summary_data = pd.DataFrame(summaries)

In [190]:
summary_data.to_csv('summary_data.csv')

In [199]:
summary_data.head()

Unnamed: 0,key_word,key_word_count,product_id
0,long wearing,"(1,165)",2112167
1,buildable coverage,(905),2112167
2,oil-free,(608),2112167
3,full coverage,(77),2112167
4,cakey,(26),2112167


In [184]:
df_product['product_id'][32:35]

32    1787571
33    1507367
34    2261865
Name: product_id, dtype: int64

In [173]:
summs = get_summary(df_product['pid_string'])

IndexError: list index out of range

In [170]:
summaries

[{'key_word': 'long wearing',
  'key_word_count': '(1,239)',
  'product_id': '2169845'},
 {'key_word': 'buildable coverage',
  'key_word_count': '(1,060)',
  'product_id': '2169845'},
 {'key_word': 'oil-free', 'key_word_count': '(773)', 'product_id': '2169845'},
 {'key_word': 'full coverage',
  'key_word_count': '(35)',
  'product_id': '2169845'},
 {'key_word': 'matte', 'key_word_count': '(29)', 'product_id': '2169845'},
 {'key_word': 'lightweight',
  'key_word_count': '(25)',
  'product_id': '2169845'},
 {'key_word': 'flawless', 'key_word_count': '(19)', 'product_id': '2169845'},
 {'key_word': 'natural', 'key_word_count': '(16)', 'product_id': '2169845'},
 {'key_word': 'natural finish',
  'key_word_count': '(13)',
  'product_id': '2169845'},
 {'key_word': 'smooth', 'key_word_count': '(13)', 'product_id': '2169845'},
 {'key_word': 'light', 'key_word_count': '(11)', 'product_id': '2169845'},
 {'key_word': 'light weight',
  'key_word_count': '(11)',
  'product_id': '2169845'},
 {'key_wor

In [154]:
df_product['product_id'][:2].map(lambda x: str(x).strip())

0    2268274
1    2257111
Name: product_id, dtype: object

In [151]:
summaries

[]

In [134]:
get_summary_content(df_product['product_id'][:1])

[]

In [38]:
url = (f'http://reviews.sephora.com/8723abredes/2246726/reviews.htm?format=embedded&page=1')
res = requests.get(url)
res.content
soup = BeautifulSoup(res.content, 'lxml')
# base_data_reviews = soup.find_all('span', {'itemprop': 'review'})

In [58]:
hm = soup.find_all('td', {'id': 'BVRRQuickTakeProTagsID'})

In [120]:
len(hm[0].find_all('li'))

30

In [125]:
len(hm[0].find_all('a'))

31

In [126]:
# key word
hm[0].find_all('a')[-1].text

'See All'

In [129]:
# number of key word uses
len(hm[0].find_all('span'))

30

In [37]:
def get_aggregate(product_ids):
    for p in product_ids:
        url = (f'http://reviews.sephora.com/8723abredes/{p}/reviews.htm?format=embedded&page=1')
        res = requests.get(url)
        res.content
        soup = BeautifulSoup(res.content, 'lxml')
        agg_r = soup.find_all('span', {'itemprop': 'aggregateRating'})
        rating = agg_r[0].find('span').text
        print(rating) 
        sleep(1)

In [33]:
df_product['product_id'][6:10]

6    2250603
7    2251874
8    2246726
9    1868165
Name: product_id, dtype: int64

In [36]:
agg_rating = []
for p in df_product['product_id'][8:10]:
    url = (f'http://reviews.sephora.com/8723abredes/{p}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    agg_r = soup.find_all('span', {'itemprop': 'aggregateRating'})
    rating = agg_r[0].find('span').text
    
    


3.9
4


In [222]:
agg_rating = []
for p in df_product['pid_string']:
    aggs = {}
    url = (f'http://reviews.sephora.com/8723abredes/{p}/reviews.htm?format=embedded&page=1')
    res = requests.get(url)
    res.content
    soup = BeautifulSoup(res.content, 'lxml')
    rating = soup.find_all('div', {'class': 'BVRRRatingSummaryColumn'})
    
    try:
        avg = rating[0].find('span', {'itemprop':'ratingValue'}).text
        count = rating[0].find('span', {'class':'BVRRCount BVRRNonZeroCount'}).text

        ind_star = rating[0].find('div', {'class':'BVRRHistogramContent'}).find_all('span')

        aggs[ind_star[1].text] = ind_star[2].text
        aggs[ind_star[4].text] = ind_star[5].text
        aggs[ind_star[7].text] = ind_star[8].text
        aggs[ind_star[10].text] = ind_star[11].text
        aggs[ind_star[13].text] = ind_star[14].text
        aggs['avg_rating'] = avg
        aggs['review_count'] = count
        aggs['product_id'] = p
        agg_rating.append(aggs)
    except:
        pass
    
    sleep(1)
    
    
    


In [98]:
rating = soup.find_all('div', {'class': 'BVRRRatingSummaryColumn'})

In [224]:
agg_rating_df = pd.DataFrame(agg_rating)

In [226]:
agg_rating_df.to_csv('agg_rating.csv')

In [227]:
agg_rating_df.shape

(965, 8)

In [99]:
# Overall Rating 
rating[0].find('span', {'itemprop':'ratingValue'}).text

'3.9'

In [101]:
rating[0].find('span', {'class':'BVRRCount BVRRNonZeroCount'}).text

'9,565 reviews'

In [217]:
# Star count
rating[0].find('div', {'class':'BVRRHistogramContent'}).find_all('span')[13].text

'1 star'