In [1]:
# import necessary libraries
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time 
from IPython.core.display import clear_output
from warnings import warn
import datetime
import re

In [2]:
data_raw = pd.read_csv('./data/restaurant_ratings.csv')
data_raw

Unnamed: 0.1,Unnamed: 0,Restaurant,Restaurant Type,Rating,Number of Ratings,Affordability,Reviews Link
0,0,Sungei Road Laksa,"['Singaporean', 'Chinese', 'Noodles']",4.5 star,69,$,yelp.com/biz/sungei-road-laksa-singapore-2?osq...
1,1,Tian Tian Hainanese Chicken Rice,"['Hainan', 'Chicken Shop']",4 star,378,$,yelp.com/biz/tian-tian-hainanese-chicken-rice-...
2,2,Coriander Leaf,['Asian Fusion'],4.5 star,11,,yelp.com/biz/coriander-leaf-singapore-2?osq=Re...
3,3,Holycrab,"['Singaporean', 'Chinese', 'Seafood']",4.5 star,17,$$$$,yelp.com/biz/holycrab-singapore?osq=Restaurants
4,4,Burnt Ends,"['Australian', 'Steakhouses', 'Barbeque']",4.5 star,72,$$$$,yelp.com/biz/burnt-ends-singapore?osq=Restaurants
...,...,...,...,...,...,...,...
215,215,Gram,"['Pancakes', 'Cafes']",4.5 star,2,,yelp.com/biz/gram-singapore-2?osq=Restaurants
216,216,Super Loco Customs House,"['Cocktail Bars', 'Mexican']",3.5 star,7,,yelp.com/biz/super-loco-customs-house-singapor...
217,217,Salted & Hung,['Australian'],4 star,14,$$$,yelp.com/biz/salted-and-hung-singapore?osq=Res...
218,218,Soon Kee Wanton Mee,['Singaporean'],4 star,1,,yelp.com/biz/soon-kee-wanton-mee-singapore?osq...


In [3]:
# remove unwanted columns
data = data_raw.drop(['Unnamed: 0', 
                     'Rating', 
                     'Number of Ratings',
                     'Affordability'],
                     axis=1)

In [4]:
data.head(5)

Unnamed: 0,Restaurant,Restaurant Type,Reviews Link
0,Sungei Road Laksa,"['Singaporean', 'Chinese', 'Noodles']",yelp.com/biz/sungei-road-laksa-singapore-2?osq...
1,Tian Tian Hainanese Chicken Rice,"['Hainan', 'Chicken Shop']",yelp.com/biz/tian-tian-hainanese-chicken-rice-...
2,Coriander Leaf,['Asian Fusion'],yelp.com/biz/coriander-leaf-singapore-2?osq=Re...
3,Holycrab,"['Singaporean', 'Chinese', 'Seafood']",yelp.com/biz/holycrab-singapore?osq=Restaurants
4,Burnt Ends,"['Australian', 'Steakhouses', 'Barbeque']",yelp.com/biz/burnt-ends-singapore?osq=Restaurants


In [5]:
# add https:// in front of the review links
data['Reviews Link'] = 'https://www.' + data['Reviews Link']

In [6]:
data.head(5)

Unnamed: 0,Restaurant,Restaurant Type,Reviews Link
0,Sungei Road Laksa,"['Singaporean', 'Chinese', 'Noodles']",https://www.yelp.com/biz/sungei-road-laksa-sin...
1,Tian Tian Hainanese Chicken Rice,"['Hainan', 'Chicken Shop']",https://www.yelp.com/biz/tian-tian-hainanese-c...
2,Coriander Leaf,['Asian Fusion'],https://www.yelp.com/biz/coriander-leaf-singap...
3,Holycrab,"['Singaporean', 'Chinese', 'Seafood']",https://www.yelp.com/biz/holycrab-singapore?os...
4,Burnt Ends,"['Australian', 'Steakhouses', 'Barbeque']",https://www.yelp.com/biz/burnt-ends-singapore?...


In [7]:
# check if length is correct
len(list(data["Reviews Link"]))

220

In [8]:
# store links as a list
LIST_OF_LINKS = list(data["Reviews Link"])
# show first 5 links
LIST_OF_LINKS[:5]

['https://www.yelp.com/biz/sungei-road-laksa-singapore-2?osq=Restaurants',
 'https://www.yelp.com/biz/tian-tian-hainanese-chicken-rice-singapore-7?osq=Restaurants',
 'https://www.yelp.com/biz/coriander-leaf-singapore-2?osq=Restaurants',
 'https://www.yelp.com/biz/holycrab-singapore?osq=Restaurants',
 'https://www.yelp.com/biz/burnt-ends-singapore?osq=Restaurants']

## Actual Scraping

In [9]:
# get the start value for every new page in yelp
start_of_new_page = [str(i) for i in range(0,381,20)]
# debug
print(start_of_new_page, end=' ')
print(f'\nNumber of pages scraped: {len(start_of_new_page)}')

['0', '20', '40', '60', '80', '100', '120', '140', '160', '180', '200', '220', '240', '260', '280', '300', '320', '340', '360', '380'] 
Number of pages scraped: 20


In [14]:
def scraper(list_of_links, num_req, start, end):
    # redeclaring lists to store data in multiple values
    cust_names = []
    cust_ratings = []
    cust_comments = []
    res_names = []
    res_types = []

    # counter
    count = 0

    # flag variable to check the scrape
    # if unsuccessful scrape, try again
    unsuccessful = True

    # preparing the monitoring of the loop
    start_time = time()

    ### ----- 

    # for every comment in the interval
    for link_raw in list_of_links[start:end+1]:
        loop_time = time()
        count+=1 # increment count to determine which link it is being scraped
        requests=1 # reset requests count for different webpage
        # print(f'----- LINK {count} -----')
        for pageStart in start_of_new_page:

            # Break the loop if the number of requests is greater than expected
            if requests > num_req:
                #warn('Number of requests was greater than expected.')
                break

            unsuccessful = True
            fail_count = 0
            repeat = 0

            while unsuccessful:
                # make a get request
                #response = get(f'https://www.yelp.com/biz/jumbo-seafood-singapore-4?start={pageStart}')
                #response = get(f'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d7348336-Reviews-or{pageStart}-Sunday_Folks-Singapore.html')
                
                #link_array = link_raw.split('Reviews-')
                link = link_raw + '&start=' + str(pageStart)
                # print(link)
                response = get(link)

                # pause the loop
                sleep(randint(1,4))

                # monitor the requests
                elapsed_time = time() - start_time
                print(f'LINK {count+start} REQUEST {requests}; Frequency: {requests/elapsed_time} requests/s')

                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')

                # get the comment container for all 20 comments in a page
                comment_containers = page_html.find_all('div', class_='review__373c0__13kpL border-color--default__373c0__3-ifU')

                if len(comment_containers) != 0:
                    print(f"REQUEST {requests}: SUCCESS --> Failed Count: {fail_count}")
                    clear_output(wait = True)
                    unsuccessful = False
                else:
                    fail_count+=1
                    repeat+=1
                    #print(f"Request {requests}: unsuccessful scrape") # debug
                if repeat >= 5:
                    print("Repeated 5 times --> cannot scrape")
                    break

            requests += 1

            # for every comments in 10
            for com in comment_containers:
                # in case the scrape fail for that particular entry due to html tag issue
                try:
                    # append the restaurant name and type
                    res_names.append(data["Restaurant"][count+start-1])
                    res_types.append(data["Restaurant Type"][count+start-1])

                    # scrape the customer name
                    cust_name = com.div.find('a', class_='link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE').text
                    cust_names.append(cust_name)

                    # scrape the customer ratings
                    cust_rating = com.find('div', class_='arrange__373c0__2C9bH gutter-1__373c0__2l5bx vertical-align-middle__373c0__1SDTo border-color--default__373c0__3-ifU').span.div['aria-label']
                    cust_ratings.append(cust_rating)

                    cust_comment_raw = com.find_all('div', class_='margin-b2__373c0__abANL border-color--default__373c0__3-ifU')
                    if len(cust_comment_raw) != 1:
                        temp = cust_comment_raw[1].text
                    else:
                        temp = cust_comment_raw[0].text

                    cust_comment = temp.replace(u'\xa0', u'')
                    cust_comments.append(cust_comment)
                except:
                    print(f'error in request {requests}')
                    continue

            # Throw a warning for non-200 status codes
            if response.status_code != 200:
                warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # check time needed to exceute one link
        print(f'Time taken for scraping link {count+start}: {time() - loop_time} seconds')

    print('DONE')
    return cust_names, cust_ratings, cust_comments, res_names, res_types

In [15]:
# scraper call function
def scraper_call(list_of_links, num_req, start, end):
    cust_names, cust_ratings, cust_comments, res_names, res_types = scraper(list_of_links = list_of_links, 
                                                                            num_req=num_req, 
                                                                            start=start, 
                                                                            end=end)
    print(f'Number of entries: {len(cust_names)}')
    review = pd.DataFrame({
        'Restaurant Name': res_names,
        'Restaurant Type': res_types,
        'Reviewer\'s Name': cust_names,
        'Rating': cust_ratings,
        'Comment': cust_comments,
    })

    print(review.info())
    if start == 0:
        review.to_csv('./data/yelp-comments.csv', mode='a', index=False)
    else:
        review.to_csv('./data/yelp-comments.csv', mode='a', index=False, header=False)

In [16]:
# determine number of requests (each request is 10 entries)
REQUESTS = 10
start_list = [s for s in range(0,221,50)]
end_list = [e for e in range(49,221,50)]

In [17]:
# test values
print(start_list)
print(end_list)

[0, 50, 100, 150, 200]
[49, 99, 149, 199]


## CALLING OF FUNCTIONS

In [19]:
# link 0 - 49
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[0], end=end_list[0])

LINK 6 REQUEST 10; Frequency: 0.002693000708592062 requests/s
LINK 6 REQUEST 10; Frequency: 0.002685427693857951 requests/s
LINK 6 REQUEST 10; Frequency: 0.0026795556541745974 requests/s
LINK 6 REQUEST 10; Frequency: 0.002675293475064264 requests/s
LINK 6 REQUEST 10; Frequency: 0.0026677482822953305 requests/s
Repeated 5 times --> cannot scrape
LINK 6 REQUEST 11; Frequency: 0.002928389626161566 requests/s
LINK 6 REQUEST 11; Frequency: 0.0029212026566337417 requests/s
LINK 6 REQUEST 11; Frequency: 0.0029140965922120326 requests/s
LINK 6 REQUEST 11; Frequency: 0.0029062198613587914 requests/s
LINK 6 REQUEST 11; Frequency: 0.0028996519023415917 requests/s
Repeated 5 times --> cannot scrape
LINK 6 REQUEST 12; Frequency: 0.0031542032153086155 requests/s
LINK 6 REQUEST 12; Frequency: 0.0031459925982298383 requests/s
LINK 6 REQUEST 12; Frequency: 0.0031397873901904527 requests/s
LINK 6 REQUEST 12; Frequency: 0.003136628145574324 requests/s
LINK 6 REQUEST 12; Frequency: 0.0031302460801101934 r

KeyboardInterrupt: 

In [None]:
# link 50 - 99
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[1], end=end_list[1])

In [None]:
# link 100 - 149
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[2], end=end_list[2])

In [None]:
# link 150 - 199
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[3], end=end_list[3])

In [None]:
# link 200 - 220
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[4], end=219)