## Second level scraper code to scrape the details of the restaurants and also the comments of the users

In [1]:
# import necessary libraries
from requests import get
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from time import time 
from IPython.core.display import clear_output
from warnings import warn
import datetime
import re

In [2]:
# retrieve dataframe of links scraped previously
links_df = pd.read_csv('./data/trip-advisor-scraper-main-url.csv')
links_df.head(10)

Unnamed: 0,Restaurant Name,Restaurant Type,Webpage
0,1. Positano @ RP,"Italian, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
1,2. Grand Shanghai Restaurant,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
2,3. Fu Lin Men (NSRCC),"Chinese, Seafood",https://www.tripadvisor.com.sg/Restaurant_Revi...
3,4. Entre-Nous creperie,"French, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
4,5. NOX - Dine in the Dark,"European, Fusion",https://www.tripadvisor.com.sg/Restaurant_Revi...
5,6. The Mind Cafe,"Cafe, Pub",https://www.tripadvisor.com.sg/Restaurant_Revi...
6,7. Song Garden,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
7,8. Fu Lin Men (CSC),"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
8,9. Melt Cafe,International,https://www.tripadvisor.com.sg/Restaurant_Revi...
9,10. Positano Risto,"Italian, Pizza",https://www.tripadvisor.com.sg/Restaurant_Revi...


In [3]:
# cleaning of name in the restaurant name column using regex
regex = '^([0-9])+\. '
links_df['Restaurant Name'].replace(regex,'',regex=True, inplace = True)
links_df[['Restaurant Name']].head(5)

Unnamed: 0,Restaurant Name
0,Positano @ RP
1,Grand Shanghai Restaurant
2,Fu Lin Men (NSRCC)
3,Entre-Nous creperie
4,NOX - Dine in the Dark


In [4]:
links_df.head(5)

Unnamed: 0,Restaurant Name,Restaurant Type,Webpage
0,Positano @ RP,"Italian, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
1,Grand Shanghai Restaurant,"Chinese, Asian",https://www.tripadvisor.com.sg/Restaurant_Revi...
2,Fu Lin Men (NSRCC),"Chinese, Seafood",https://www.tripadvisor.com.sg/Restaurant_Revi...
3,Entre-Nous creperie,"French, European",https://www.tripadvisor.com.sg/Restaurant_Revi...
4,NOX - Dine in the Dark,"European, Fusion",https://www.tripadvisor.com.sg/Restaurant_Revi...


In [5]:
# extract the links out
#list(links_df["Webpage"])
# check if length is correct
len(list(links_df["Webpage"]))

1013

In [6]:
# store links as a list
LIST_OF_LINKS = list(links_df["Webpage"])
# show first 5 links
LIST_OF_LINKS[:5]

['https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d21180746-Reviews-Positano_RP-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d1145149-Reviews-Grand_Shanghai_Restaurant-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d17171783-Reviews-Fu_Lin_Men_NSRCC-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d1193730-Reviews-Entre_Nous_creperie-Singapore.html',
 'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d4611806-Reviews-NOX_Dine_in_the_Dark-Singapore.html']

## Actual Scraping

In [7]:
# get the start value for every new page in yelp
start_of_new_page = [str(i) for i in range(0,501,10)]
# debug
print(start_of_new_page, end=' ')
print(f'\nNumber of pages scraped: {len(start_of_new_page)}')

['0', '10', '20', '30', '40', '50', '60', '70', '80', '90', '100', '110', '120', '130', '140', '150', '160', '170', '180', '190', '200', '210', '220', '230', '240', '250', '260', '270', '280', '290', '300', '310', '320', '330', '340', '350', '360', '370', '380', '390', '400', '410', '420', '430', '440', '450', '460', '470', '480', '490', '500'] 
Number of pages scraped: 51


## For initial scrape

In [8]:
def scraper(list_of_links, num_req, start, end):
    # redeclaring lists to store data in multiple values
    cust_names = []
    cust_ratings = []
    cust_comments = []
    res_names = []
    res_types = []

    # counter
    count = 0

    # flag variable to check the scrape
    # if unsuccessful scrape, try again
    unsuccessful = True

    # preparing the monitoring of the loop
    start_time = time()

    ### ----- 

    # for every comment in the interval of 10
    for link_raw in list_of_links[start:end+1]:
        loop_time = time()
        count+=1 # increment count to determine which link it is being scraped
        requests=1 # reset requests count for different webpage
        # print(f'----- LINK {count} -----')
        for pageStart in start_of_new_page:

            # Break the loop if the number of requests is greater than expected
            if requests > num_req:
                #warn('Number of requests was greater than expected.')
                break

            unsuccessful = True
            fail_count = 0
            repeat = 0

            while unsuccessful:
                # make a get request
                #response = get(f'https://www.yelp.com/biz/jumbo-seafood-singapore-4?start={pageStart}')
                #response = get(f'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d7348336-Reviews-or{pageStart}-Sunday_Folks-Singapore.html')
                link_array = link_raw.split('Reviews-')
                link = link_array[0] + 'Reviews-' + 'or' + str(pageStart) + "-" +link_array[1]
                # print(link)
                response = get(link)

                # pause the loop
                sleep(randint(2,5))

                # monitor the requests
                elapsed_time = time() - start_time
                print(f'LINK {count+start} REQUEST {requests}; Frequency: {requests/elapsed_time} requests/s')

                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')

                # get the comment container for all 20 comments in a page
                comment_containers = page_html.find_all('div', class_='review-container')

                if len(comment_containers) != 0:
                    print(f"REQUEST {requests}: SUCCESS --> Failed Count: {fail_count}")
                    clear_output(wait = True)
                    unsuccessful = False
                else:
                    fail_count+=1
                    repeat+=1
                    #print(f"Request {requests}: unsuccessful scrape") # debug
                if repeat >= 5:
                    print("Repeated 5 times --> cannot scrape")
                    break

            requests += 1

            # for every comments in 10
            for com in comment_containers:
                # in case the scrape fail for that particular entry due to html tag issue
                try:
                    # append the restaurant name and type
                    res_names.append(links_df["Restaurant Name"][count+start-1])
                    res_types.append(links_df["Restaurant Type"][count+start-1])

                    # scrape the customer name
                    cust_name = com.find('div', class_='info_text pointer_cursor').text
                    cust_names.append(cust_name)

                    # scrape the customer ratings
                    cust_rating_raw = com.find('div', class_ ='ui_column is-9').span['class']

                    # convert the rating to an integer
                    cust_rating = 0
                    if cust_rating_raw[1] == 'bubble_50':
                        cust_rating = 5
                    elif cust_rating_raw[1] == 'bubble_40':
                        cust_rating = 4
                    elif cust_rating_raw[1] == 'bubble_30':
                        cust_rating = 3
                    elif cust_rating_raw[1] == 'bubble_20':
                        cust_rating = 2
                    elif cust_rating_raw[1] == 'bubble_10':
                        cust_rating = 1
                    else:
                        cust_rating = 0
                    cust_ratings.append(cust_rating)

                    # scrape the customer comment
                    cust_comment_raw = com.find('div', class_='entry').text

                    # get rid of the unwanted characters like "..." and "More"
                    comment_ = cust_comment_raw.replace("..."," ")

                    # remove instances with "More" tab if there are more words
                    cust_comment = ""
                    if comment_[-4:] == "More":
                        cust_comment = comment_[:-4]
                    else:
                        cust_comment = comment_

                    cust_comments.append(cust_comment)
                except:
                    print(f'error in request {requests}')
                    continue

            # Throw a warning for non-200 status codes
            if response.status_code != 200:
                warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # check time needed to exceute one link
        print(f'Time taken for scraping link {count+start}: {time() - loop_time} seconds')

    print('DONE')
    return cust_names, cust_ratings, cust_comments, res_names, res_types

In [9]:
# scraper call function
def scraper_call(list_of_links, num_req, start, end):
    cust_names, cust_ratings, cust_comments, res_names, res_types = scraper(list_of_links = list_of_links, 
                                                                            num_req=num_req, 
                                                                            start=start, 
                                                                            end=end)
    print(f'Number of entries: {len(cust_names)}')
    review = pd.DataFrame({
        'Restaurant Name': res_names,
        'Restaurant Type': res_types,
        'Reviewer\'s Name': cust_names,
        'Rating': cust_ratings,
        'Comment': cust_comments, 
    })

    print(review.info())
    if start == 0:
        review.to_csv('./data/trip-advisor-comments.csv', mode='a', index=False)
    else:
        review.to_csv('./data/trip-advisor-comments.csv', mode='a', index=False, header=False)

## Scrape V2

In [10]:
def scraper_v2(list_of_links, num_req, start, end):
    # redeclaring lists to store data in multiple values
    cust_names = []
    cust_ratings = []
    cust_comments = []
    res_names = []
    res_types = []

    # counter
    count = 0

    # flag variable to check the scrape
    # if unsuccessful scrape, try again
    unsuccessful = True

    # preparing the monitoring of the loop
    start_time = time()

    ### ----- 

    # for every comment in the interval of 10
    for link_raw in list_of_links[start:end+1]:
        loop_time = time()
        count+=1 # increment count to determine which link it is being scraped
        requests=1 # reset requests count for different webpage
        # print(f'----- LINK {count} -----')
        for pageStart in start_of_new_page:

            # Break the loop if the number of requests is greater than expected
            if requests > num_req:
                #warn('Number of requests was greater than expected.')
                break

            unsuccessful = True
            fail_count = 0
            repeat = 0

            while unsuccessful:
                # make a get request
                #response = get(f'https://www.yelp.com/biz/jumbo-seafood-singapore-4?start={pageStart}')
                #response = get(f'https://www.tripadvisor.com.sg/Restaurant_Review-g294265-d7348336-Reviews-or{pageStart}-Sunday_Folks-Singapore.html')
                link_array = link_raw.split('Reviews-')
                link = link_array[0] + 'Reviews-' + 'or' + str(pageStart) + "-" +link_array[1]
                # print(link)
                response = get(link)

                # pause the loop
                sleep(randint(2,5))

                # monitor the requests
                elapsed_time = time() - start_time
                print(f'LINK {count+start} REQUEST {requests}; Frequency: {requests/elapsed_time} requests/s')

                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')

                # get the comment container for all 20 comments in a page
                comment_containers = page_html.find_all('div', class_='review-container')

                if len(comment_containers) != 0:
                    print(f"REQUEST {requests}: SUCCESS --> Failed Count: {fail_count}")
                    clear_output(wait = True)
                    unsuccessful = False
                else:
                    fail_count+=1
                    repeat+=1
                    #print(f"Request {requests}: unsuccessful scrape") # debug
                if repeat >= 5:
                    print("Repeated 5 times --> cannot scrape")
                    break

            requests += 1

            # for every comments in 10
            for com in comment_containers:
                # in case the scrape fail for that particular entry due to html tag issue
                try:
                    # append the restaurant name and type
                    res_names.append(links_df["Restaurant Name"][count+start-1])
                    res_types.append(links_df["Restaurant Type"][count+start-1])

                    # scrape the customer name
                    cust_name = com.find('div', class_='info_text pointer_cursor').text
                    cust_names.append(cust_name)

                    # scrape the customer ratings
                    cust_rating_raw = com.find('div', class_ ='ui_column is-9').span['class']

                    # convert the rating to an integer
                    cust_rating = 0
                    if cust_rating_raw[1] == 'bubble_50':
                        cust_rating = 5
                    elif cust_rating_raw[1] == 'bubble_40':
                        cust_rating = 4
                    elif cust_rating_raw[1] == 'bubble_30':
                        cust_rating = 3
                    elif cust_rating_raw[1] == 'bubble_20':
                        cust_rating = 2
                    elif cust_rating_raw[1] == 'bubble_10':
                        cust_rating = 1
                    else:
                        cust_rating = 0
                    cust_ratings.append(cust_rating)

                    # scrape the customer comment
                    cust_comment_raw = com.find('div', class_='entry').text

                    # get rid of the unwanted characters like "..." and "More"
                    comment_ = cust_comment_raw.replace("..."," ")

                    # remove instances with "More" tab if there are more words
                    cust_comment = ""
                    if comment_[-4:] == "More":
                        cust_comment = comment_[:-4]
                    else:
                        cust_comment = comment_

                    cust_comments.append(cust_comment)
                except:
                    print(f'error in request {requests}')
                    continue

            # Throw a warning for non-200 status codes
            if response.status_code != 200:
                warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        # check time needed to exceute one link
        print(f'Time taken for scraping link {count+start}: {time() - loop_time} seconds')
        
        print(f'Number of entries: {len(cust_names)}')
        
        try:
            # fit dataframe and transfer to csv for every webpage 
            review = pd.DataFrame({
                'Restaurant Name': res_names,
                'Restaurant Type': res_types,
                'Reviewer\'s Name': cust_names,
                'Rating': cust_ratings,
                'Comment': cust_comments, 
            })

            print(review.info())
            if start == 0:
                review.to_csv('./data/trip-advisor-comments.csv', mode='a', index=False)
            else:
                review.to_csv('./data/trip-advisor-comments.csv', mode='a', index=False, header=False)
            clear_output(wait = True)
        
        except:
            print(f"Unable to scrape LINK {count+start} due to unequal elements in the columns. Moving on...")
            # empty the list for the next webpage
            cust_names = []
            cust_ratings = []
            cust_comments = []
            res_names = []
            res_types = []
            continue
            
        # empty the list for the next webpage
        cust_names = []
        cust_ratings = []
        cust_comments = []
        res_names = []
        res_types = []

    print('DONE')

In [11]:
# determine number of requests (each request is 10 entries)
REQUESTS = 10
start_list = [s for s in range(0,1014,50)]
end_list = [e for e in range(49,1014,50)]

In [12]:
# test values
print(start_list)
print(end_list)

[0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
[49, 99, 149, 199, 249, 299, 349, 399, 449, 499, 549, 599, 649, 699, 749, 799, 849, 899, 949, 999]


## CALLING OF FUNCTIONS

In [13]:
# link 0 - 49
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[0], end=end_list[0])

Time taken for scraping link 50: 48.78237295150757 seconds
DONE
Number of entries: 5000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  5000 non-null   object
 1   Restaurant Type  5000 non-null   object
 2   Reviewer's Name  5000 non-null   object
 3   Rating           5000 non-null   int64 
 4   Comment          5000 non-null   object
dtypes: int64(1), object(4)
memory usage: 195.4+ KB
None


In [15]:
# link 50 - 99
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[1], end=end_list[1])

Time taken for scraping link 100: 73.11896085739136 seconds
DONE
Number of entries: 4971
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4971 entries, 0 to 4970
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4971 non-null   object
 1   Restaurant Type  4971 non-null   object
 2   Reviewer's Name  4971 non-null   object
 3   Rating           4971 non-null   int64 
 4   Comment          4971 non-null   object
dtypes: int64(1), object(4)
memory usage: 194.3+ KB
None


In [16]:
# link 100 - 149
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[2], end=end_list[2])

Time taken for scraping link 150: 74.75528454780579 seconds
DONE
Number of entries: 5000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  5000 non-null   object
 1   Restaurant Type  5000 non-null   object
 2   Reviewer's Name  5000 non-null   object
 3   Rating           5000 non-null   int64 
 4   Comment          5000 non-null   object
dtypes: int64(1), object(4)
memory usage: 195.4+ KB
None


In [17]:
# link 150 - 199
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[3], end=end_list[3])

Time taken for scraping link 200: 63.61752152442932 seconds
DONE
Number of entries: 4985
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4985 entries, 0 to 4984
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4985 non-null   object
 1   Restaurant Type  4985 non-null   object
 2   Reviewer's Name  4985 non-null   object
 3   Rating           4985 non-null   int64 
 4   Comment          4985 non-null   object
dtypes: int64(1), object(4)
memory usage: 194.9+ KB
None


In [18]:
# link 200 - 249
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[4], end=end_list[4])

Time taken for scraping link 250: 76.94319868087769 seconds
DONE
Number of entries: 4880
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4880 entries, 0 to 4879
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4880 non-null   object
 1   Restaurant Type  4880 non-null   object
 2   Reviewer's Name  4880 non-null   object
 3   Rating           4880 non-null   int64 
 4   Comment          4880 non-null   object
dtypes: int64(1), object(4)
memory usage: 190.8+ KB
None


In [20]:
# link 250 - 299
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[5], end=end_list[5])

Time taken for scraping link 300: 68.3052167892456 seconds
DONE
Number of entries: 4965
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4965 entries, 0 to 4964
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4965 non-null   object
 1   Restaurant Type  4965 non-null   object
 2   Reviewer's Name  4965 non-null   object
 3   Rating           4965 non-null   int64 
 4   Comment          4965 non-null   object
dtypes: int64(1), object(4)
memory usage: 194.1+ KB
None


In [21]:
# link 300 - 349
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[6], end=end_list[6])

Time taken for scraping link 350: 64.7284643650055 seconds
DONE
Number of entries: 4984
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4984 entries, 0 to 4983
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4984 non-null   object
 1   Restaurant Type  4984 non-null   object
 2   Reviewer's Name  4984 non-null   object
 3   Rating           4984 non-null   int64 
 4   Comment          4984 non-null   object
dtypes: int64(1), object(4)
memory usage: 194.8+ KB
None


In [None]:
# got error
# link 350 - 399
#scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[7], end=end_list[7])

In [26]:
# link 350 - 359
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=350, end=359)

Time taken for scraping link 360: 60.504833698272705 seconds
DONE
Number of entries: 992
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 992 entries, 0 to 991
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  992 non-null    object
 1   Restaurant Type  992 non-null    object
 2   Reviewer's Name  992 non-null    object
 3   Rating           992 non-null    int64 
 4   Comment          992 non-null    object
dtypes: int64(1), object(4)
memory usage: 38.9+ KB
None


In [27]:
# link 360 - 369
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=360, end=369)

Time taken for scraping link 370: 59.58866786956787 seconds
DONE
Number of entries: 990
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  990 non-null    object
 1   Restaurant Type  990 non-null    object
 2   Reviewer's Name  990 non-null    object
 3   Rating           990 non-null    int64 
 4   Comment          990 non-null    object
dtypes: int64(1), object(4)
memory usage: 38.8+ KB
None


In [28]:
# link 370 - 379
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=370, end=379)

Time taken for scraping link 380: 73.09391021728516 seconds
DONE
Number of entries: 1000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  1000 non-null   object
 1   Restaurant Type  1000 non-null   object
 2   Reviewer's Name  1000 non-null   object
 3   Rating           1000 non-null   int64 
 4   Comment          1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB
None


In [29]:
# link 380 - 389
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=380, end=389)

Time taken for scraping link 390: 72.13982772827148 seconds
DONE
Number of entries: 991
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  991 non-null    object
 1   Restaurant Type  991 non-null    object
 2   Reviewer's Name  991 non-null    object
 3   Rating           991 non-null    int64 
 4   Comment          991 non-null    object
dtypes: int64(1), object(4)
memory usage: 38.8+ KB
None


In [31]:
# link 390 - 398 -> got problem in 399
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=390, end=398)

Time taken for scraping link 399: 68.3887779712677 seconds
DONE
Number of entries: 891
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  891 non-null    object
 1   Restaurant Type  891 non-null    object
 2   Reviewer's Name  891 non-null    object
 3   Rating           891 non-null    int64 
 4   Comment          891 non-null    object
dtypes: int64(1), object(4)
memory usage: 34.9+ KB
None


In [12]:
# link 400 - 449
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[8], end=end_list[8])

Time taken for scraping link 450: 61.41247820854187 seconds
DONE
Number of entries: 4964
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4964 entries, 0 to 4963
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4964 non-null   object
 1   Restaurant Type  4964 non-null   object
 2   Reviewer's Name  4964 non-null   object
 3   Rating           4964 non-null   int64 
 4   Comment          4964 non-null   object
dtypes: int64(1), object(4)
memory usage: 194.0+ KB
None


In [13]:
# link 450 - 499
scraper_call(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[9], end=end_list[9])

Time taken for scraping link 500: 89.59984612464905 seconds
DONE
Number of entries: 4939
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4939 entries, 0 to 4938
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  4939 non-null   object
 1   Restaurant Type  4939 non-null   object
 2   Reviewer's Name  4939 non-null   object
 3   Rating           4939 non-null   int64 
 4   Comment          4939 non-null   object
dtypes: int64(1), object(4)
memory usage: 193.1+ KB
None


## made some code modification from this point onwards to curb with 'arrays must all be same length' problem

In [20]:
# link 500 - 549
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[10], end=end_list[10])

Time taken for scraping link 550: 76.73704671859741 seconds
Number of entries: 100
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Restaurant Name  100 non-null    object
 1   Restaurant Type  100 non-null    object
 2   Reviewer's Name  100 non-null    object
 3   Rating           100 non-null    int64 
 4   Comment          100 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.0+ KB
None
DONE


In [22]:
# link 550 - 599
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[11], end=579)

Done


In [23]:
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=580, end=end_list[11])

DONE


In [24]:
# link 600 - 649
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[12], end=end_list[12])

DONE


In [25]:
# link 650 - 699
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[13], end=end_list[13])

DONE


In [26]:
# link 700 - 749
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[14], end=end_list[14])

DONE


In [27]:
# link 750 - 799
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[15], end=end_list[15])

DONE


In [28]:
# link 800 - 849
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[16], end=end_list[16])

DONE


In [29]:
# link 850 - 899
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[17], end=end_list[17])

DONE


In [48]:
# link 900 - 949
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[18], end=end_list[18])

DONE


In [13]:
# link 950 - 999
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[19], end=end_list[19])

DONE


In [14]:
# link 1000 - 1013
scraper_v2(list_of_links=LIST_OF_LINKS, num_req=REQUESTS, start=start_list[20], end=1012)

DONE
