Scrape All Reviews in All URLs

In [23]:
#load necessary libraries
from selenium import webdriver
import time
import chromedriver_binary
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
import pandas as pd
import re


# Pass --disable-build-check to chromedriver so that a Chrome/chromedriver
# version mismatch does not stop the session from starting.
service = Service(service_args=['--disable-build-check'])

# Open a narrow 600x800 window to simulate the mobile view of the site.
options = Options()
options.add_argument("window-size=600,800")
driver = webdriver.Chrome(options=options, service=service)


The chromedriver version (128.0.6582.0) detected in PATH at C:\Users\22507\anaconda3\lib\site-packages\chromedriver_binary\chromedriver.exe might not be compatible with the detected chrome version (126.0.6478.127); currently, chromedriver 126.0.6478.126 is recommended for chrome 126.*, so it is advised to delete the driver in PATH and retry


In [24]:
def get_all_reviews(url):
    """Scrape every review of one restaurant page, following pagination.

    Uses the module-level Selenium ``driver``. The page at ``url`` is loaded,
    the reviews on the first page are scraped, and the "Next" link is clicked
    repeatedly until no clickable "Next" link shows up within 10 seconds.

    Args:
        url: Address of the restaurant page to open.

    Returns:
        A list of ``(restaurant_name, star, review_date, review_content)``
        tuples. ``star`` is the first whitespace-separated token of the
        rating element's ``aria-label``; ``restaurant_name`` is the text of
        the first ``h1`` on the page, or ``""`` if it could not be read.
        Dates, ratings and texts are collected as three independent page-wide
        lists and combined with ``zip``, so the result is only as long as the
        shortest of them (a warning is printed when they differ).
    """
    driver.get(url)

    page_number = 1
    review_contents = []
    stars = []
    review_dates = []

    # Compiled once per call instead of once per scraped page.
    date_pattern = re.compile(r'\b[A-Za-z]{3} \d{1,2}, \d{4}\b')

    def scrape_page():
        """Append the dates, ratings and texts found on the current page.

        Each of the three lookups is best-effort: a failure is reported and
        the remaining lookups still run. ``Exception`` (not a bare except) is
        caught so that interrupting the kernel still stops the scrape.
        """
        # Review Date: the span class is shared with other text, so only
        # strings that start like "Jul 30, 2022" are kept.
        try:
            for review_element in driver.find_elements(By.CSS_SELECTOR, 'span.y-css-wfbtsu'):
                text = review_element.text
                if date_pattern.match(text):
                    review_dates.append(text)
        except Exception as exc:
            print(f"Could not read review dates on page {page_number}: {exc}")

        # Review's Rating: the number is the first token of the aria-label.
        try:
            numbers = driver.find_elements(
                By.CSS_SELECTOR,
                'div.arrange-unit__09f24__rqHTg.y-css-1iy1dwt .y-css-9tnml4[role="img"][aria-label*="star rating"]')
            for number in numbers:
                aria_label = number.get_attribute("aria-label")
                stars.append(aria_label.split()[0])
        except Exception as exc:
            print(f"Could not read review ratings on page {page_number}: {exc}")

        # Review Contents: collapse line breaks so each review is one line.
        try:
            for review in driver.find_elements(By.CSS_SELECTOR, 'p.comment__09f24__D0cxf.y-css-h9c2fl'):
                review_contents.append(review.text.replace('\n', ' '))
        except Exception as exc:
            print(f"Could not read review texts on page {page_number}: {exc}")

    # scrape first page (fixed pause gives the page time to render)
    time.sleep(5)
    print('Page 1 ...')
    scrape_page()

    # scrape other pages
    while True:
        # element_to_be_clickable only returns an element that is both
        # displayed and enabled, so no extra is_enabled() check is needed.
        try:
            load_next_page_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH,
                                            '//a[@class="next-link navigation-button__09f24__m9qRz y-css-7ln3jw" and @aria-label="Next"]'))
            )
        except TimeoutException:
            print(f"No more 'Next' button found. Stopping at page {page_number}.")
            break

        try:
            load_next_page_button.click()
            print("Clicked 'Next' button...Loading...")
            page_number += 1
            print(f"Page {page_number} ...")

            time.sleep(3)

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'span.y-css-wfbtsu'))
            )
        except TimeoutException:
            print(f"Timed out waiting for page {page_number} to load. Stopping.")
            break
        except Exception as exc:
            # e.g. the click was intercepted or the element went stale:
            # stop paginating but keep everything collected so far.
            print(f"Stopping pagination for {url}: {exc}")
            break

        scrape_page()

    # Restaurant Names: fall back to "" so a failed lookup does not empty
    # the zip below and discard every scraped review.
    restaurant_name = ""
    try:
        restaurant_name = driver.find_element(By.CSS_SELECTOR, "h1").text
    except Exception as exc:
        print(f"Could not read the restaurant name for {url}: {exc}")

    if not (len(stars) == len(review_dates) == len(review_contents)):
        print(f"Warning: mismatched counts for {url} "
              f"(ratings={len(stars)}, dates={len(review_dates)}, texts={len(review_contents)}); "
              "only the shortest length is kept and rows may be misaligned.")

    restaurant_names = [restaurant_name] * len(stars)
    review_list = list(zip(restaurant_names, stars, review_dates, review_contents))

    return review_list

In [25]:
def get_all_restaurants(url, num_pages):
    """Collect restaurant links from up to ``num_pages`` search-result pages.

    Uses the module-level Selenium ``driver``. The page at ``url`` is loaded,
    the ``href`` of every ``a.y-css-12ly5yx`` element is recorded, and the
    pagination button is clicked to move on to the next result page.

    Args:
        url: Address of the first search-result page.
        num_pages: Maximum number of result pages to read.

    Returns:
        A list of unique, non-empty ``href`` values in first-seen order.
        Whatever was collected before an error or a missing pagination
        button is still returned.
    """
    driver.get(url)

    # dict keys de-duplicate while keeping first-seen order, so the result
    # is deterministic (a set would scramble the order between runs).
    seen_urls = {}

    try:
        for i in range(num_pages):
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.y-css-12ly5yx'))
            )

            for restaurant in driver.find_elements(By.CSS_SELECTOR, 'a.y-css-12ly5yx'):
                href = restaurant.get_attribute('href')
                if href:  # skip anchors without a usable link
                    seen_urls.setdefault(href, None)

            print(f"Page {i + 1}...")

            # The last requested page has been read; loading one more page
            # that is never scraped would only waste time.
            if i == num_pages - 1:
                break

            # find and click next page button
            try:
                load_next_page_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.pagination-button__09f24__kbFYf.y-css-1ewzev[data-button="true"]'))
                )

                load_next_page_button.click()

                print("Clicked 'Next Page' button...Loading...")

                WebDriverWait(driver, 10).until(
                    EC.invisibility_of_element_located((By.CSS_SELECTOR, 'button.pagination-button__09f24__kbFYf.y-css-1ewzev[data-button="false"]'))
                )
                time.sleep(3)

            except Exception as e:
                print(f"Next page button not found or an error occurred: {e}")
                break

    except Exception as e:
        print(f"An error occurred: {e}")

    return list(seen_urls)

In [26]:
# Collect restaurant links from up to 24 pages of the Singapore restaurant search.
restaurant_urls = get_all_restaurants(
    "https://www.yelp.com/search?find_desc=Restaurants&find_loc=Singapore",
    24,
)

Page 1...
Clicked 'Next Page' button...Loading...
Page 2...
Clicked 'Next Page' button...Loading...
Page 3...
Clicked 'Next Page' button...Loading...
Page 4...
Clicked 'Next Page' button...Loading...
Page 5...
Clicked 'Next Page' button...Loading...
Page 6...
Clicked 'Next Page' button...Loading...
Page 7...
Clicked 'Next Page' button...Loading...
Page 8...
Clicked 'Next Page' button...Loading...
Page 9...
Clicked 'Next Page' button...Loading...
Page 10...
Clicked 'Next Page' button...Loading...
Page 11...
Clicked 'Next Page' button...Loading...
Page 12...
Clicked 'Next Page' button...Loading...
Page 13...
Clicked 'Next Page' button...Loading...
Page 14...
Clicked 'Next Page' button...Loading...
Page 15...
Clicked 'Next Page' button...Loading...
Page 16...
Clicked 'Next Page' button...Loading...
Page 17...
Clicked 'Next Page' button...Loading...
Page 18...
Clicked 'Next Page' button...Loading...
Page 19...
Clicked 'Next Page' button...Loading...
Page 20...
Clicked 'Next Page' button...

In [27]:
# Scrape each restaurant in turn, keeping one list of review tuples per restaurant.
all_reviews = []
for restaurant_url in restaurant_urls:
    restaurant_reviews = get_all_reviews(restaurant_url)
    all_reviews.append(restaurant_reviews)
    time.sleep(0.5)  # short pause between restaurants

# Flatten the per-restaurant lists into a single list of review tuples;
# anything that is not a list is kept as a single item.
nested_list = all_reviews
flattened_list = []
for sublist in nested_list:
    if isinstance(sublist, list):
        flattened_list.extend(sublist)
    else:
        flattened_list.append(sublist)

column_names = ['Restaurant Name', 'Review Rating', 'Review Date', 'Review Content']
df = pd.DataFrame(flattened_list, columns=column_names)
df

Page 1 ...
Clicked 'Next' button...Loading...
Page 2 ...
Clicked 'Next' button...Loading...
Page 3 ...
Clicked 'Next' button...Loading...
Page 4 ...
No more 'Next' button found. Stopping at page 4.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
Clicked 'Next' button...Loading...
Page 2 ...
No more 'Next' button found. Stopping at page 2.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
Clicked 'Next' button...Loading...
Page 2 ...
No more 'Next' button found. Stopping at page 2.
Page 1 ...
No more 'Next' button found. Stopping at page 1.
Page 1 ...
No more 'Next' button found. Stopping a

Unnamed: 0,Restaurant Name,Review Rating,Review Date,Review Content
0,Usman Restaurant,4,"Jul 30, 2022",Daal Keema fry & pala paneer were good Mix ta...
1,Usman Restaurant,5,"May 20, 2023",Very good Pakistani restaurant. I ate chicken ...
2,Usman Restaurant,4,"Mar 22, 2016",Cash only. I hate cash only places. Cleanline...
3,Usman Restaurant,4,"Apr 19, 2017","it's late, and you're hungry. No Jack in the C..."
4,Usman Restaurant,5,"Jul 19, 2014","So, we were just exploring Little India and I ..."
...,...,...,...,...
4639,Trattoria Lafiandra,2,"May 11, 2014",When we arrived we ignored for a very long tim...
4640,Trattoria Lafiandra,5,"Aug 3, 2015",One of my favourite Italian restaurants in Asi...
4641,Trattoria Lafiandra,1,"May 15, 2014",Probable racist chef-owner refused to serve me...
4642,Katong Keah Kee,4,"Jan 15, 2017",The oyster pancake here is inbetween Chaozhou ...


In [28]:
# Write the review table to disk as UTF-8 CSV, leaving out the DataFrame index.
csv_file_path = 'C:/Users/22507/Desktop/NUS/project/restaurant_reviews.csv'
df.to_csv(
    csv_file_path,
    encoding='utf-8',
    index=False,
)
print(f"CSV file saved at: {csv_file_path}")

CSV file saved at: C:/Users/22507/Desktop/NUS/project/restaurant_reviews.csv


In [29]:
# Scraping is finished: end the Selenium session started in the setup cell.
driver.quit()