In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time, csv
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.relative_locator import locate_with

## Scrape Function

In [5]:
def scrape(food,city):

    # Go to Yelp.com and accept cookies
    page_url = 'https://www.yelp.com'
    driver.get(page_url)
    cookies = WebDriverWait(driver,5).until(EC.element_to_be_clickable((By.ID, 'onetrust-accept-btn-handler'))).click()

    # Find search text box
    description = driver.find_element(By.ID, 'search_description') 
    location = driver.find_element(By.ID, 'search_location') 

    # Delete any initial input
    description.send_keys(Keys.CONTROL, 'a')
    description.send_keys(Keys.DELETE)
    location.send_keys(Keys.CONTROL, 'a')
    location.send_keys(Keys.DELETE)

    # Send user input in search box
    description.send_keys(food)
    location.send_keys(city)

    # Click search button
    button = driver.find_element(By.CSS_SELECTOR, '[data-testid="suggest-submit"]') 
    button.click()                         
    driver.implicitly_wait(10)
    
    # Check if the input was correct    
    not_valid_imput = driver.find_element(By.CSS_SELECTOR, '[class="css-oxqmph"]') 

    substring1 = "No results"
    substring2 = "Sorry"
     
    if substring1 in not_valid_imput.text:
        print("No results for:", food, city)
        
    elif substring2 in not_valid_imput.text:
        print("Sorry, but we didn't understand the location you entered.")
    
    # If the imput is correct find and click the 1st result
    else:           
        restaurants = driver.find_elements(By.CSS_SELECTOR, 'h3.css-1agk4wl span.css-1egxyvc')
        addresses = driver.find_elements(By.CSS_SELECTOR, 'p.css-dzq7l1 span.css-chan6m')
    
        print("The proposed restaurants are: ")
        print()
    
        first_result = []
        for i in range(len(restaurants)):
            rest_text = []
            rest_text.append(restaurants[i].text)
            rest_text.append(addresses[i].text)
            print(rest_text)

            if rest_text[0][0:2] == '1.':   # We use 1. to exlude 10 value
                restaurants[i].click()      # Click the restaurant
                first_result.append(restaurants[i].text) # Keep the reviewed restaurant     
                first_result.append(addresses[i].text)
                
        print()
        print("The reviewed restaurant is:  ", first_result)

        # Because the restaurant open in a new tab we must change tab
        original_window = driver.current_window_handle
        for window_handle in driver.window_handles:
            if window_handle != original_window:
                driver.switch_to.window(window_handle)
                break

        # Name of the restaurant (2nd tab)       
        restaurant_name = driver.find_element(By.CSS_SELECTOR, 'h1.css-1se8maq').text
        
        # Open a csv writer for metadata
        outpath = f'data/metadata.csv'
        f = open(outpath, 'w', encoding = 'utf8')
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['food','city','rest_name','rest_location'])
        writer.writerow([food, city, restaurant_name, first_result[1]])
        f.close
        
        # Open a new csv writer for data reviews
        outpath = f'data/yelp_reviews_{food}_{city}.csv'
        f = open(outpath, 'w', encoding = 'utf8')
        writer = csv.writer(f, lineterminator='\n')
        writer.writerow(['date','rating','review'])

        # In the restaurant's tab
        while True:
             
            # Scroll down    
            driver.execute_script('window,scrollTo(0,document.body.scrollHeight)')  
                
            # Elements used for location help
            navigation_bar = driver.find_element(By.CSS_SELECTOR, '[class=" pagination__09f24__VRjN4 border-color--default__09f24__NPAKY"]')
            filter_bar = driver.find_element(By.CSS_SELECTOR, '[class=" arrange__09f24__LDfbs gutter-auto__09f24__W9jlL grid__09f24__S5_aJ vertical-align-bottom__09f24__A9C03 border-color--default__09f24__NPAKY"]')

            # Dates, Ratings, Reviews elements 
            dates = driver.find_elements(locate_with(By.CSS_SELECTOR, 'span.css-chan6m').above(navigation_bar).below(filter_bar))
            ratings = driver.find_elements(locate_with(By.CSS_SELECTOR, '[class=" five-stars__09f24__mBKym five-stars--regular__09f24__DgBNj display--inline-block__09f24__fEDiJ border-color--default__09f24__NPAKY"]').above(navigation_bar).below(filter_bar))
            reviews = driver.find_elements(locate_with(By.CSS_SELECTOR, 'span.raw__09f24__T4Ezm').above(navigation_bar).below(filter_bar))

            # Write to csv
            for i in range(len(reviews)):
                date = dates[i].text
                rating = ratings[i].get_attribute('aria-label')
                review = reviews[i].text.replace('\n\n'," ").replace('\n'," ")   
                writer.writerow([date, rating, review])
                       
            # Checking for last page in reviews
            try:
                next_button_check = driver.find_element(By.CSS_SELECTOR, '[class="icon--24-chevron-right-v2 navigation-button-icon__09f24__Bmrde navigation-button-icon--disabled__09f24__z98Q4 css-1lx34my"]'  )
                last_page = True
            except NoSuchElementException:
                last_page = False

            if last_page:
                break
  
            # Click on next page in reviews
            next_button = driver.find_element(By.CSS_SELECTOR, '[class="next-link navigation-button__09f24__m9qRz css-144i0wq"]')
            next_button.click()    
            time.sleep(3)

        f.close 
        
        print()
        print("The scrapping results are in: ", outpath)

## Scrape Dataset

In [6]:
driver = webdriver.Chrome(ChromeDriverManager().install())

  driver = webdriver.Chrome(ChromeDriverManager().install())


In [7]:
scrape('Indian Food','London')

The proposed restaurants are: 

['1. Dishoom', 'Covent Garden']
['2. Dishoom', 'Soho']
['3. Gymkhana', 'Mayfair']
['4. Tamarind Kitchen', 'Soho']
['5. Punjab Restaurant', 'Covent Garden']
['6. Roti Chai', 'Marylebone']
['7. The Rajdoot', 'Marylebone']
['8. Trishna', 'Marylebone']
['9. Shah Tandoori', 'Euston']
['10. Dishoom - Kensington', 'Kensington']

The reviewed restaurant is:   ['1. Dishoom', 'Covent Garden']

The scrapping results are in:  yelp_reviews_Indian Food_London.csv
