In [None]:
# so why learn to scrape websites?
# if you want to hone your ds skills, play around with data, do a pet project, you need 
# a dataset. There are plenty of datasets around on kaggle, government websites etc
# and you can come up with plenty of questions that can be answered using those datasets
# but what if you're interested in a specific subject/question and there's no data for
# it? sometimes you'd be able to get them by web-scraping


# scraping websites is an (extremely) iterative process
# basic workflow:
# 1. find the data you want on the web
# 2. inspect the webpages, identify the elements that you want
# 3. write code to parse the elements


# you'd start small and test code on one page but when you start implementing your code
# on other pages, often times you'll find that you have to modify your code to generalize
# well; compartmentalize code to keep it clean and manageable
# so I'm gonna go through how i scraped data on one page then show you how it fit into
# the entire workflow

# another point i want to mention is that I'm scraping a static page. I'll talk about
# scraping interactive javascript-rendered websites at the end

# if anybody has any suggestions, comments, questions, better ways of doing things,
# please feel free to let me know

In [None]:
# useful links:
# https://www.w3schools.com/html/html_basic.asp
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [None]:
# this is part of a project i'm doing, build a recommender system for skincare products
# for this, i need to have a dataset of users ratings of skincare products and the
# characteristics of users


In [94]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import bs4
from bs4 import BeautifulSoup
import time
import re
import math
import string
import collections
import dill
from requests_futures.sessions import FuturesSession
from retrying import retry

# 1. Scrape static websites using Requests and BeautifulSoup

In [4]:
# my goal is to get the name of the product, the general info of the product, then for
# user ratings, i want to get username, their rating, the date, age, skin, hair, eyes

# https://www.makeupalley.com/product/searching.asp?Brand=&BrandName=&CategoryID=709&title=

demo_link = 'https://www.makeupalley.com/product/showreview.asp/ItemId=151441/TL-Tightening-Neck-Cream/StriVectin/Neck/Decollete-Cream'
# most web pages are html files, and html files are just text files with special
# markup.
# we'll use the requests package to request the webpage and get its content.
# always pass a timeout: without one, requests can wait forever on a stalled server
r = requests.get(demo_link, timeout=30)
html_doc = r.text

In [5]:
# this is the webpage we saw in raw text form
html_doc



In [6]:
# in order to make sense of the structure, we'll use BeautifulSoup to parse this text
# into a html tree form
soup = BeautifulSoup(html_doc, "html5lib")

In [8]:
print(soup.prettify())
# beautiful soup has parsed the html structure, tags, elements for us

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   StriVectin TL Tightening Neck Cream reviews, photo, ingredients  - Makeupalley
  </title>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="225670347474872" property="fb:app_id"/>
  <meta content="MakeupAlley" property="og:site_name"/>
  <meta content="index,follow" name="robots"/>
  <meta content="noarchive" name="robots"/>
  <meta content="noarchive" name="googlebot"/>
  <meta content="https://www.makeupalley.com/product/showreview.asp/ItemId=151441/TL-Tightening-Neck-Cream/StriVectin/Neck/Decollete-Cream" property="og:url"/>
  <meta content="article" property="og:type"/>
  <meta content="StriVectin TL Tightening Neck Cream reviews, photo, ingredients" property="og:title"/>
  <meta content="StriVectin  TL Tightening Neck Cream: rated 2.6 out of 5 on MakeupAlley.  See 19 member reviews,  ingredients and photo." property="og:description"/>
  <meta content="https://img.makeupal

In [None]:
# there are 2 types of html items that we want to remember: elements/tags and attributes
# an element has a start tag and end tag eg. title
# sometimes it won't have an explicit end tag but just a '/'
# elements sometimes have attributes. most popular is the href attribute to denote a link
# usually comes in the form of name='value'
# common attributes: href, id, class

In [None]:
# inspect product name, it's in the element <h1>

In [32]:
name_list = soup.find_all('h1')

In [15]:
name_list

[<h1><a class="track_Nav_Home" href="/"><img alt="MakeupAlley" class="discreet" height="51" src="/art/bs/Mua_Logo3x.png" width="247"/></a></h1>,
 <h1>StriVectin TL Tightening Neck Cream</h1>]

In [29]:
# select() method offers more flexibility

name = soup.select('div#main h1')
# name2 = soup.select('div.col-sm-7 h1')

# tag2 doesn't have to be a direct child of tag1; any descendant of tag1 matches

In [33]:
name

[<h1>StriVectin TL Tightening Neck Cream</h1>]

In [39]:
print(type(name[0]))
name[0].text

<class 'bs4.element.Tag'>


'StriVectin TL Tightening Neck Cream'

In [45]:
rating = soup.find('h3').text

In [46]:
rating

'2.6'

In [47]:
rating = soup.select('div.product-review-stats h3')

In [48]:
rating

[<h3>2.6</h3>]

In [49]:
rating[0].text

'2.6'

In [52]:
# what if the data we want is in an attribute?
# eg. getting links from href attribute is a popular task
brand_list = soup.find_all(class_ = "track_BreadCrumbs_Brand")
brand_href = brand_list[0].get('href')

In [53]:
brand_list

[<a class="track_BreadCrumbs_Brand" href="/product/searching.asp/Brand=1694/brandname=StriVectin/" itemprop="url"><span itemprop="title">StriVectin</span></a>]

In [70]:
brand_list[0].get('href')

'/product/searching.asp/Brand=1694/brandname=StriVectin/'

In [71]:
# for i in brand_list[0].children:
#     print(i)
#     print(i.get('itemprop'))
# #     for child in i:
# #         print(child)

In [72]:
# helper variables
base_url = 'https://www.makeupalley.com'
# have to use https because using http will make it default to the first review page
part_prod_link = '/product/showreview.asp/ItemId='
part_brand_link = '/product/searching.asp/Brand='


def go_link(link, timeout=30):
    """
    Download a page and parse it into a BeautifulSoup tree
    :param link: string, full URL of the page to fetch
    :param timeout: seconds to wait for the server before requests gives up with a
        timeout error; without a timeout a stalled connection would block forever
    :return: BeautifulSoup object built from the response text with the html5lib parser
    """
    response = requests.get(link, timeout=timeout)
    return BeautifulSoup(response.text, "html5lib")

def _texts(tags):
    """Return the .text of every tag in tags, in document order."""
    return [tag.text for tag in tags]


def _lipie_class(tag):
    """Return the class token of tag that starts with 'l-' (e.g. 'l-3-0')."""
    classes = tag['class']
    for token in classes:
        if token.startswith('l-'):
            return token
    # "l-" only occurred inside another class name: keep the previous behaviour
    # of returning the first class token
    return classes[0]


def go_prod(link):
    """
    Go to one product review page and collect the product info and the reviews on it
    :param link: string that represents the latter part of the link to the product
        review page (it is appended to base_url)
    :return: None when the page has no <title> element. Otherwise a tuple of 12 lists:
        name, rating, repurchase, pkg_qual, price, ingredient, user_list, char_list,
        review_list, lipie_list, date_list, brand.
        name, rating, repurchase, pkg_qual, price, ingredient and brand hold the same
        product-level string repeated n times, where n = len(user_list).
        user_list, char_list, review_list, lipie_list and date_list hold whatever the
        page contains for each selector; their lengths are not checked against n.
    :raises IndexError: when an expected product element (second <h1>, repurchase /
        package quality / price paragraph, ingredients, brand breadcrumb) is missing
    :raises AttributeError: when the page has no <h3> element
    """

    soup = go_link(base_url+link)

    # nothing to parse on a page without a <title>; callers must handle the None
    if len(soup.select('title')) == 0:
        return None

    # per-review fields: users, their characteristics and their review text
    user_list = _texts(soup.find_all(class_='user-name'))
    # no need to visit each user's page to count their reviews: there is no minimum
    # number of reviews required for a user
    char_list = _texts(soup.find_all(class_='important'))
    review_list = _texts(soup.find_all(class_='comment-content'))

    # user rating ("lipies") for the product. span is a tag, class is an attribute;
    # [class*="l-"] keeps spans whose class attribute CONTAINS "l-", so the matching
    # token is not guaranteed to be the first class - pick it out explicitly
    lipies = soup.select('div.lipies span[class*="l-"]')
    lipie_list = [_lipie_class(span) for span in lipies]

    # review dates (only div.date elements are matched)
    date_list = _texts(soup.select('div.date'))

    # number of reviews on this page
    n = len(user_list)

    # product-level values are repeated n times so they line up with the reviews

    # name of product: on the demo page the first <h1> is the site logo and the
    # second one is the product name
    name = [soup.find_all('h1')[1].text] * n

    # overall rating of product: the first <h3> on the page
    rating = [soup.find('h3').text] * n

    repurchase = [soup.find_all('p', string=re.compile('would repurchase'))[0].text] * n
    pkg_qual = [soup.find_all('p', string=re.compile('Package Quality'))[0].text] * n
    price = [soup.find_all('p', string=re.compile('Price'))[0].text] * n
    ingredient = [soup.find_all(id='hold-ingredients')[0].text] * n

    # brand of product
    brand = [soup.find_all(class_ = "track_BreadCrumbs_Brand")[0].text] * n

    return name, rating, repurchase, pkg_qual, price, ingredient, user_list, char_list, review_list, lipie_list, date_list, brand

In [73]:
# go_prod returns, in this order: name, rating, repurchase, pkg_qual, price, ingredient,
# user_list, char_list, review_list, lipie_list, date_list, brand
# (go_prod returns None for a page without a <title>, in which case this unpacking fails)
a, b, c, d, e, f, g, h, k, l, m, n = go_prod('/product/showreview.asp/ItemId=151441/TL-Tightening-Neck-Cream/StriVectin/Neck/Decollete-Cream')

In [84]:
# one column per value returned by go_prod
df = pd.DataFrame({
    # product-level values
    'names': a,
    'ratings': b,
    'repurchases': c,
    'pkg_quals': d,
    'prices': e,
    'ingredients': f,
    'brands': n,
    # per-review values
    'users': g,
    'chars': h,
    'reviews': k,
    'lipies': l,
    'dates': m,
})

In [87]:
df[:3]

Unnamed: 0,names,ratings,repurchases,pkg_quals,prices,ingredients,brands,users,chars,reviews,lipies,dates
0,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tDmurphy431\t\t\t\t\t\t\t,"\t\t\tAge: 56 & Over\t\t\tSkin: Other, Fair-Me...",\t\t\t\t\t\t\tI should know that there is no s...,l-3-0,\t\t\t\ton 12/30/2018 5:24:00 PM\t\t\t
1,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tLeslieCZ\t\t\t\t\t\t\t,"\t\t\tAge: 56 & Over\t\t\tSkin: Very Dry, Fair...",\t\t\t\t\t\t\tThis is similar to Revision Nect...,l-4-0,\t\t\t\ton 5/1/2017 7:56:00 PM\t\t\t
2,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tnoBSbeauty\t\t\t\t\t\t\t,"\t\t\tAge: 25-29\t\t\tSkin: Dry, Fair-Medium, ...",\t\t\t\t\t\t\tThere are several issues with th...,l-1-0,\t\t\t\ton 4/7/2017 6:54:00 PM\t\t\t


# 2. Structure

In [None]:
# function to parse information from a page
# a wrapper of some sort to go through each of the review page of a product
# a function to collect all the products
# another helper function/wrapper to go through all pages

# 3. Checkpoints

In [95]:
# your code works and you're scraping along just fine. but what if your internet went
# down? or you forgot to set your computer not to go to sleep
# save your progress along the way. one way is to use a checkpoint package like pickle/dill
# the with-block closes the file, so the checkpoint is fully written to disk
with open('df.pkd', 'wb') as checkpoint_file:
    dill.dump(df, checkpoint_file)

In [96]:
# only load checkpoints you wrote yourself: like pickle, dill can run arbitrary code on load
with open('df.pkd', 'rb') as checkpoint_file:
    df2 = dill.load(checkpoint_file)

In [98]:
df2[:3]

Unnamed: 0,names,ratings,repurchases,pkg_quals,prices,ingredients,brands,users,chars,reviews,lipies,dates
0,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tDmurphy431\t\t\t\t\t\t\t,"\t\t\tAge: 56 & Over\t\t\tSkin: Other, Fair-Me...",\t\t\t\t\t\t\tI should know that there is no s...,l-3-0,\t\t\t\ton 12/30/2018 5:24:00 PM\t\t\t
1,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tLeslieCZ\t\t\t\t\t\t\t,"\t\t\tAge: 56 & Over\t\t\tSkin: Very Dry, Fair...",\t\t\t\t\t\t\tThis is similar to Revision Nect...,l-4-0,\t\t\t\ton 5/1/2017 7:56:00 PM\t\t\t
2,StriVectin TL Tightening Neck Cream,2.6,36% would repurchase,Package Quality: 2.8,Price: $$$,"Water (Aqua), Butyrospermum Parkii (Shea Butte...",StriVectin,\t\t\t\t\t\t\t\tnoBSbeauty\t\t\t\t\t\t\t,"\t\t\tAge: 25-29\t\t\tSkin: Dry, Fair-Medium, ...",\t\t\t\t\t\t\tThere are several issues with th...,l-1-0,\t\t\t\ton 4/7/2017 6:54:00 PM\t\t\t


# 4. Scrape dynamic websites using Requests and Selenium

In [100]:
# useful websites
# http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
# http://selenium-python.readthedocs.io/locating-elements.html
# https://stackoverflow.com/questions/22476112/using-chromedriver-with-selenium-python-ubuntu

from selenium import webdriver
from selenium.webdriver.common.by import By

In [124]:
# launch a browser that Python can control through Selenium
# NOTE: the chromedriver path below is specific to this machine; point it at your own chromedriver
driver = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver') 
# replace with .Firefox(), or with the browser of your choice
url_login = 'https://www.makeupalley.com/account/login.asp'
# open the login page in the controlled browser
driver.get(url_login)

In [125]:
# log in
# find_element(By.ID, ...) works on old and new Selenium releases; the
# find_element_by_id shortcut was removed in newer ones
username = driver.find_element(By.ID, "UserName")  # username form field
password = driver.find_element(By.ID, "Password")  # password form field

In [115]:
# these are elements controlled by Selenium/Python
username

<selenium.webdriver.remote.webelement.WebElement (session="1f84f5ce23afec3515296fce3ae99c53", element="0.056409107606746556-1")>

In [116]:
password

<selenium.webdriver.remote.webelement.WebElement (session="1f84f5ce23afec3515296fce3ae99c53", element="0.056409107606746556-2")>

In [126]:
# send username and password to the webpage
# read the credentials from the environment so real ones never get saved in the
# notebook; the fallbacks are the original placeholder values
username.send_keys(os.environ.get("MAKEUPALLEY_USERNAME", "testacount"))
password.send_keys(os.environ.get("MAKEUPALLEY_PASSWORD", "password"))

In [127]:
# click submit to login
# find_element(By.ID, ...) works on old and new Selenium releases; the
# find_element_by_id shortcut was removed in newer ones
submitButton = driver.find_element(By.ID, "login")
submitButton.click()

In [128]:
# set zoom level
# open Chrome's settings page and set the default zoom to 0.25 from a script
# (presumably 25% - confirm against Chrome's settingsPrivate API)
driver.get('chrome://settings/')
driver.execute_script('chrome.settingsPrivate.setDefaultZoom(0.25);')

# load the site with the reduced zoom in effect
url = 'https://www.makeupalley.com'
driver.get(url)

# go back to the settings page and put the default zoom back to 1
driver.get('chrome://settings/')
driver.execute_script('chrome.settingsPrivate.setDefaultZoom(1);')

In [131]:
# with dynamic websites, sometimes you have to wait for all the elements to load
url = 'https://www.makeupalley.com'
driver.get(url)

# fixed 5-second pause before moving on to the next page
# NOTE(review): a fixed sleep either wastes time or is too short; waiting for a
# specific element (Selenium's explicit waits) would be more reliable - consider switching
time.sleep(5)
# demo_link is the product review URL defined in section 1
driver.get(demo_link)