## Second level scraper code to scrape the details of the restaurants and also the comments of the users

In [1]:
# import necessary libraries
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time 
from IPython.core.display import clear_output
from warnings import warn
import datetime
import re

In [2]:
# retrieve dataframe of links scraped previously
links_df = pd.read_csv('./data/trip-advisor-scraper-main.csv')
links_df.head(10)

Unnamed: 0,Restaurant Name,Restaurant Type,Webpage
0,1. Positano @ RP,"Italian, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
1,2. Grand Shanghai Restaurant,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
2,3. Fu Lin Men (NSRCC),"Chinese, Seafood",https://www.tripadvisor.com.sg/Restaurant_Revi...
3,4. Entre-Nous creperie,"French, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
4,5. NOX - Dine in the Dark,"European, Fusion",https://www.tripadvisor.com.sg/Restaurant_Revi...
5,6. The Mind Cafe,"Cafe, Pub",https://www.tripadvisor.com.sg/Restaurant_Revi...
6,7. Song Garden,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
7,8. Fu Lin Men (CSC),"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
8,9. Melt Cafe,International,https://www.tripadvisor.com.sg/Restaurant_Revi...
9,10. Positano Risto,"Italian, Pizza",https://www.tripadvisor.com.sg/Restaurant_Revi...


In [3]:
# cleaning of name in the restaurant name column using regex
regex = '^([0-9])+\. '
links_df['Restaurant Name'].replace(regex,'',regex=True, inplace = True)
links_df[['Restaurant Name']].head(5)

Unnamed: 0,Restaurant Name
0,Positano @ RP
1,Grand Shanghai Restaurant
2,Fu Lin Men (NSRCC)
3,Entre-Nous creperie
4,NOX - Dine in the Dark


In [4]:
links_df.head(5)

Unnamed: 0,Restaurant Name,Restaurant Type,Webpage
0,Positano @ RP,"Italian, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
1,Grand Shanghai Restaurant,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
2,Fu Lin Men (NSRCC),"Chinese, Seafood",https://www.tripadvisor.com.sg/Restaurant_Revi...
3,Entre-Nous creperie,"French, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
4,NOX - Dine in the Dark,"European, Fusion",https://www.tripadvisor.com.sg/Restaurant_Revi...


In [5]:
# extract the links out
#list(links_df["Webpage"])
# check if length is correct
len(list(links_df["Webpage"]))

1013

In [6]:
# store links as a list
LIST_OF_LINKS = list(links_df["Webpage"])
# show first 5 links
LIST_OF_LINKS[:5]

['https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d21180746-Reviews-Positano_RP-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d1145149-Reviews-Grand_Shanghai_Restaurant-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d17171783-Reviews-Fu_Lin_Men_NSRCC-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d1193730-Reviews-Entre_Nous_creperie-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d4611806-Reviews-NOX_Dine_in_the_Dark-Singapore.html']

## Actual Scraping

In [7]:
# get the start value for every new page in yelp
start_of_new_page = [str(i) for i in range(0,501,10)]
# debug
print(start_of_new_page, end=' ')
print(f'\nNumber of pages scraped: {len(start_of_new_page)}')

['0', '10', '20', '30', '40', '50', '60', '70', '80', '90', '100', '110', '120', '130', '140', '150', '160', '170', '180', '190', '200', '210', '220', '230', '240', '250', '260', '270', '280', '290', '300', '310', '320', '330', '340', '350', '360', '370', '380', '390', '400', '410', '420', '430', '440', '450', '460', '470', '480', '490', '500'] 
Number of pages scraped: 51


In [8]:
def scraper(list_of_links, num_req, start, end):
    # redeclaring lists to store data in multiple values
    cust_names = []
    cust_ratings = []
    cust_comments = []
    res_names = []
    res_types = []

    # counter
    count = 0

    # flag variable to check the scrape
    # if unsuccessful scrape, try again
    unsuccessful = True

    # preparing the monitoring of the loop
    start_time = time()

    ### ----- 

    # for every comment in the interval of 10
    for link_raw in list_of_links[start:end]:
        loop_time = time()
        count+=1 # increment count to determine which link it is being scraped
        requests=1 # reset requests count for different webpage
        # print(f'----- LINK {count} -----')
        for pageStart in start_of_new_page:

            # Break the loop if the number of requests is greater than expected
            if requests > num_req:
                #warn('Number of requests was greater than expected.')
                break

            unsuccessful = True
            fail_count = 0

            while unsuccessful:
                # make a get request
                #response = get(f'https://www.yelp.com/biz/jumbo-seafood-singapore-4?start={pageStart}')
                #response = get(f'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d7348336-Reviews-or{pageStart}-Sunday_Folks-Singapore.html')
                link_array = link_raw.split('Reviews-')
                link = link_array[0] + 'Reviews-' + 'or' + str(pageStart) + "-" +link_array[1]
                # print(link)
                response = get(link)

                # pause the loop
                sleep(randint(1,2))

                # monitor the requests
                elapsed_time = time() - start_time
                print(f'LINK {count+start} REQUEST {requests}; Frequency: {requests/elapsed_time} requests/s')

                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')

                # get the comment container for all 20 comments in a page
                comment_containers = page_html.find_all('div', class_='review-container')

                if len(comment_containers) != 0:
                    print(f"REQUEST {requests}: SUCCESS --> Failed Count: {fail_count}")
                    clear_output(wait = True)
                    unsuccessful = False
                else:
                    fail_count+=1
                    #print(f"Request {requests}: unsuccessful scrape") # debug
                    pass
            requests += 1

            # for every comments in 10
            for com in comment_containers:
                # in case the scrape fail for that particular entry due to html tag issue
                try:
                    # append the restaurant name and type
                    res_names.append(links_df["Restaurant Name"][count+start-1])
                    res_types.append(links_df["Restaurant Type"][count+start-1])

                    # scrape the customer name
                    cust_name = com.find('div', class_='info_text pointer_cursor').text
                    cust_names.append(cust_name)

                    # scrape the customer ratings
                    cust_rating_raw = com.find('div', class_ ='ui_column is-9').span['class']

                    # convert the rating to an integer
                    cust_rating = 0
                    if cust_rating_raw[1] == 'bubble_50':
                        cust_rating = 5
                    elif cust_rating_raw[1] == 'bubble_40':
                        cust_rating = 4
                    elif cust_rating_raw[1] == 'bubble_30':
                        cust_rating = 3
                    elif cust_rating_raw[1] == 'bubble_20':
                        cust_rating = 2
                    elif cust_rating_raw[1] == 'bubble_10':
                        cust_rating = 1
                    else:
                        cust_rating = 0
                    cust_ratings.append(cust_rating)

                    # scrape the customer comment
                    cust_comment_raw = com.find('div', class_='entry').text

                    # get rid of the unwanted characters like "..." and "More"
                    comment_ = cust_comment_raw.replace("..."," ")

                    # remove instances with "More" tab if there are more words
                    cust_comment = ""
                    if comment_[-5:] == ".More":
                        cust_comment = comment_[:-4]
                    else:
                        cust_comment = comment_

                    cust_comments.append(cust_comment)
                except:
                    print(f'error in request {requests-1}')
                    continue

            # Throw a warning for non-200 status codes
            if response.status_code != 200:
                warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # check time needed to exceute one link
        print(f'Time taken for scraping link {count+start}: {time() - loop_time} seconds')

    print('DONE')
    return cust_names, cust_ratings, cust_comments, res_names, res_types

In [9]:
# scraper call function
def scraper_call(list_of_links, num_req, start, end):
    cust_names, cust_ratings, cust_comments, res_names, res_types = scraper(list_of_links = list_of_links, 
                                                                            num_req=num_req, 
                                                                            start=start, 
                                                                            end=end)
    print(f'Number of entries: {len(cust_names)}')
    review = pd.DataFrame({
        'Restaurant Name': res_names,
        'Restaurant Type': res_types,
        'Reviewer\'s Name': cust_names,
        'Rating': cust_ratings,
        'Comment': cust_comments, 
    })

    print(review.info())
    review.to_csv('./data/trip-advisor-comments.csv', mode='a', index=False)

In [10]:
# determine number of requests (each request is 10 entries)
REQUESTS = 10
start_list = [s for s in range(1,1014,50)]
end_list = [e for e in range(50,1014,50)]

In [None]:
# test values
start_list

In [None]:
end_list

In [None]:
# link 0 - 50
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=0, end=end_list[0])

LINK 35 REQUEST 6; Frequency: 0.00360269083083521 requests/s
REQUEST 6: SUCCESS --> Failed Count: 0


In [None]:
# link 51 - 100
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[1], end=end_list[1])

In [None]:
# link 101 - 150
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[2], end=end_list[2])

In [None]:
# link 151 - 200
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[3], end=end_list[3])

In [None]:
# link 201 - 250
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[4], end=end_list[4])

In [None]:
# link 251 - 300
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[5], end=end_list[5])

In [None]:
# link 301 - 350
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[6], end=end_list[6])

In [None]:
# link 351 - 400
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[7], end=end_list[7])

In [None]:
# link 401 - 450
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[8], end=end_list[8])

In [None]:
# link 451 - 500
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[9], end=end_list[9])

In [None]:
# link 501 - 550
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[10], end=end_list[10])

In [None]:
# link 551 - 600
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[11], end=end_list[11])

In [None]:
# link 601 - 650
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[12], end=end_list[12])

In [None]:
# link 651 - 700
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[13], end=end_list[13])

In [None]:
# link 701 - 750
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[14], end=end_list[14])

In [None]:
# link 751 - 800
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[15], end=end_list[15])

In [None]:
# link 801 - 850
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[16], end=end_list[16])

In [None]:
# link 851 - 900
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[17], end=end_list[17])

In [None]:
# link 901 - 950
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[17], end=end_list[17])

In [None]:
# link 951 - 1000
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[18], end=end_list[18])

In [None]:
# link 1001 - 1013
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[19], end=1013)