In [None]:
# import necessary libraries
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time 
from IPython.core.display import clear_output
from warnings import warn
import datetime
import re

In [None]:
# read the restaurant ratings CSV into a DataFrame and show it
data_raw = pd.read_csv(filepath_or_buffer='./data/restaurant_ratings.csv')
data_raw

In [None]:
# keep a working copy of the raw frame without the unwanted columns
data = data_raw.drop(
    columns=['Unnamed: 0', 'Rating', 'Number of Ratings', 'Affordability']
)

In [None]:
# preview the first five rows (head() defaults to 5)
data.head()

In [None]:
# add "https://www." in front of the review links
# (built from data_raw rather than from data itself, so re-running this cell
#  does not prepend the prefix a second time)
data['Reviews Link'] = 'https://www.' + data_raw['Reviews Link']

In [None]:
# preview the first five rows again to inspect the prefixed links
data.head(n=5)

In [None]:
# sanity check: how many review links there are
len(data["Reviews Link"])

In [None]:
# materialise the link column as a plain Python list
LIST_OF_LINKS = [*data["Reviews Link"]]
# peek at the first 5 links
LIST_OF_LINKS[0:5]

## Actual Scraping

In [None]:
# `start` offsets used to page through a restaurant's reviews: "0", "20", ..., "380"
start_of_new_page = list(map(str, range(0, 381, 20)))
# debug: show the offsets, then how many pages they cover
print(start_of_new_page, end=' ')
print(f'\nNumber of pages scraped: {len(start_of_new_page)}')

In [None]:
def _parse_review(com):
    """Extract (reviewer name, rating label, comment text) from one review container.

    `com` is one element returned by `find_all` for the review container class.
    If an expected tag or attribute is missing, the failing lookup raises
    (e.g. AttributeError, TypeError, KeyError or IndexError); the caller treats
    that as "skip this review".
    """
    # reviewer name
    cust_name = com.div.find('a', class_='link__373c0__1G70M link-color--inherit__373c0__3dzpk link-size--inherit__373c0__1VFlE').text

    # rating is read from the aria-label attribute of the rating element
    cust_rating = com.find('div', class_='arrange__373c0__2C9bH gutter-1__373c0__2l5bx vertical-align-middle__373c0__1SDTo border-color--default__373c0__3-ifU').span.div['aria-label']

    # comment text: use the second matching div unless there is exactly one match
    cust_comment_raw = com.find_all('div', class_='margin-b2__373c0__abANL border-color--default__373c0__3-ifU')
    if len(cust_comment_raw) != 1:
        temp = cust_comment_raw[1].text
    else:
        temp = cust_comment_raw[0].text

    # strip non-breaking spaces from the comment
    return cust_name, cust_rating, temp.replace(u'\xa0', u'')


def scraper(list_of_links, num_req, start, end, max_retries=8):
    """Scrape reviewer name, rating and comment for a slice of restaurant links.

    For every link in ``list_of_links[start:end+1]`` the pages given by the
    notebook-level ``start_of_new_page`` offsets are requested, at most
    ``num_req`` pages per link. A page that yields no review containers is
    requested again, up to ``max_retries`` attempts, before it is given up on.
    The restaurant name and type of each review are looked up in the
    notebook-level ``data`` frame by the link's position.

    Parameters
    ----------
    list_of_links : list of str
        Review-page URLs; position ``i`` is looked up as row label ``i`` of ``data``.
    num_req : int
        Maximum number of pages requested per link.
    start, end : int
        First and last (inclusive) position in ``list_of_links`` to scrape.
    max_retries : int, default 8
        Attempts per page before that page is abandoned.

    Returns
    -------
    tuple of five lists
        ``(cust_names, cust_ratings, cust_comments, res_names, res_types)``;
        all five have the same length, one entry per successfully parsed review.
    """
    cust_names = []
    cust_ratings = []
    cust_comments = []
    res_names = []
    res_types = []

    # monitoring of the whole run
    start_time = time()
    total_requests = 0  # every HTTP request made, used for the frequency print-out

    for offset, link_raw in enumerate(list_of_links[start:end+1]):
        loop_time = time()
        row = start + offset   # row label of this link in `data`
        link_no = row + 1      # 1-based link number shown in the progress output
        request_no = 1         # page counter, restarts for every link

        # use '&' when the link already carries a query string, '?' otherwise
        separator = '&' if '?' in link_raw else '?'

        for pageStart in start_of_new_page:

            # stop once the requested number of pages has been fetched for this link
            if request_no > num_req:
                break

            fail_count = 0
            while True:
                link = link_raw + separator + 'start=' + str(pageStart)
                response = get(link)
                total_requests += 1

                # pause between requests
                sleep(randint(3,7))

                # monitor the requests
                elapsed_time = time() - start_time
                print(f'LINK {link_no} REQUEST {request_no}; Frequency: {total_requests/elapsed_time} requests/s')

                # parse the page and collect its review containers
                page_html = BeautifulSoup(response.text, 'html.parser')
                comment_containers = page_html.find_all('div', class_='review__373c0__13kpL border-color--default__373c0__3-ifU')

                if len(comment_containers) != 0:
                    print(f"REQUEST {request_no}: SUCCESS --> Failed Count: {fail_count}")
                    clear_output(wait = True)
                    break

                # no containers found: count the failure and retry until the limit
                fail_count += 1
                if fail_count >= max_retries:
                    print(f"Repeated {max_retries} times --> cannot scrape")
                    break

            for com in comment_containers:
                # a single review may fail to parse (html tag issue); skip only that one
                try:
                    res_name = data["Restaurant"][row]
                    res_type = data["Restaurant Type"][row]
                    cust_name, cust_rating, cust_comment = _parse_review(com)
                except Exception:
                    print(f'error in request {request_no}')
                    continue

                # append only after every field was extracted, so the five
                # lists always stay the same length
                res_names.append(res_name)
                res_types.append(res_type)
                cust_names.append(cust_name)
                cust_ratings.append(cust_rating)
                cust_comments.append(cust_comment)

            # throw a warning for non-200 status codes (of the last attempt for this page)
            if response.status_code != 200:
                warn('Request: {}; Status code: {}'.format(request_no, response.status_code))

            request_no += 1

        # time needed to process one link
        print(f'Time taken for scraping link {link_no}: {time() - loop_time} seconds')

    print('DONE')
    return cust_names, cust_ratings, cust_comments, res_names, res_types

In [None]:
# scraper call function
def scraper_call(list_of_links, num_req, start, end, output_path='./data/yelp-comments.csv'):
    """Scrape one batch of links and append the reviews to a CSV file.

    Calls ``scraper`` for ``list_of_links[start:end+1]``, builds a DataFrame
    with the columns 'Restaurant Name', 'Restaurant Type', "Reviewer's Name",
    'Rating' and 'Comment', prints a summary and appends the rows to
    ``output_path``.

    Parameters
    ----------
    list_of_links : list of str
        Review-page URLs handed through to ``scraper``.
    num_req : int
        Maximum number of pages requested per link.
    start, end : int
        First and last (inclusive) position of the batch.
    output_path : str, default './data/yelp-comments.csv'
        CSV file the batch is appended to.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    cust_names, cust_ratings, cust_comments, res_names, res_types = scraper(
        list_of_links=list_of_links,
        num_req=num_req,
        start=start,
        end=end,
    )
    print(f'Number of entries: {len(cust_names)}')
    review = pd.DataFrame({
        'Restaurant Name': res_names,
        'Restaurant Type': res_types,
        'Reviewer\'s Name': cust_names,
        'Rating': cust_ratings,
        'Comment': cust_comments,
    })

    # info() prints its report itself and returns None; wrapping it in print()
    # would add a stray "None" line
    review.info()

    # write the header only when the file is new or empty, so that re-running a
    # batch never puts a second header row in the middle of the file and the
    # first batch written always carries the header, whichever batch that is
    out_file = Path(output_path)
    write_header = (not out_file.exists()) or out_file.stat().st_size == 0
    review.to_csv(out_file, mode='a', index=False, header=write_header)

In [None]:
# maximum number of page requests per restaurant link (passed on as num_req)
REQUESTS = 20
# first / last link index of each batch of 50 links
start_list = list(range(0, 221, 50))
end_list = list(range(49, 221, 50))

In [None]:
# show the batch boundaries, one list per line
print(start_list, end_list, sep='\n')

## CALLING OF FUNCTIONS

In [None]:
# first batch: links 0 - 49
scraper_call(LIST_OF_LINKS, REQUESTS, start_list[0], end_list[0])

In [None]:
# second batch: links 50 - 99
scraper_call(LIST_OF_LINKS, REQUESTS, start_list[1], end_list[1])

In [None]:
# third batch: links 100 - 149
scraper_call(LIST_OF_LINKS, REQUESTS, start_list[2], end_list[2])

In [None]:
# fourth batch: links 150 - 199
scraper_call(LIST_OF_LINKS, REQUESTS, start_list[3], end_list[3])

In [None]:
# last batch: link 200 up to the final link in the list
# (the end index is inclusive; it was previously hard-coded as 219, which is
#  the same slice whenever the list holds at most 220 links)
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[4], end=len(LIST_OF_LINKS) - 1)