In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import pandas as pd
import random

def _text_or_default(driver, xpath, default):
    """Return the text of the first element matching *xpath*, or *default* if none exists."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _current_page_comments(driver):
    """Return the comment texts on the currently displayed page, joined by spaces.

    The first matching span is skipped (presumably it is not a review comment
    -- TODO confirm against the page markup). Returns "Comments not found"
    when nothing matches or the elements keep going stale.
    """
    comment_xpath = './/span[contains(@class, "l9bbXUdC9v0- ZatlKKd1hyc- ukvN6yaH1Ds-")]'
    # Reading .text can raise StaleElementReferenceException if the page
    # re-renders between the lookup and the read, so re-locate once and retry.
    for _ in range(2):
        elements = driver.find_elements(By.XPATH, comment_xpath)
        if not elements:
            break
        try:
            return " ".join(element.text for element in elements[1:])
        except StaleElementReferenceException:
            continue
    return "Comments not found"


def _open_next_comments_page(driver):
    """Click the "next page" link; return True on success, False if it cannot be clicked."""
    try:
        # Located fresh on every call so a reference from a previous page is never reused.
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//a[@aria-label="Go to the next page"]'))
        )
        driver.execute_script("arguments[0].scrollIntoView();", next_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(2)  # give the next page of comments time to load
    except (TimeoutException, ElementNotInteractableException, StaleElementReferenceException):
        return False
    return True


def scrape_restaurant(driver, url, max_pages=3):
    """Scrape one restaurant page and append the result to the module-level ``data``.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Address of the restaurant page to load.
        max_pages: Maximum number of comment pages to read (default 3).

    Side effects:
        Appends exactly one value to every list in the global ``data`` dict
        and increments the global ``rest_count``. All appends happen together
        at the end, so an exception raised while scraping leaves the columns
        of ``data`` at equal length.
    """
    global rest_count
    driver.get(url)

    # Each field falls back to a "... not found" placeholder when its element is missing.
    record = {
        "url": url,
        "rest_name": _text_or_default(driver, '//h1', "Name not found"),
        "number_of_reviews": _text_or_default(
            driver, '//*[@id="reviewInfo"]/span[2]', "Reviews not found"),
        "rating": _text_or_default(
            driver, './/span[contains(@class, "m1KNa9XKCHY- C7Tp-bANpE4-")]', "Rating not found"),
        "food_type": _text_or_default(
            driver, '//*[@id="cuisineInfo"]/span[2]', "Food type not found"),
        "coupon": _text_or_default(
            driver, '//div[contains(@id, "priceBandInfo")]//span[last()]', "Coupon not found"),
        "food": _text_or_default(
            driver, '//span[text()="Food"]/preceding-sibling::span', "Food not found"),
        "service": _text_or_default(
            driver, '//span[text()="Service"]/preceding-sibling::span', "Service not found"),
        "ambience": _text_or_default(
            driver, '//span[text()="Ambience"]/preceding-sibling::span', "Ambience not found"),
        "value": _text_or_default(
            driver, '//span[text()="Value"]/preceding-sibling::span', "Value not found"),
    }

    # Image URL: the second matching image is taken (the first is assumed to be a logo).
    # find_elements returns an empty list instead of raising, so no try/except is needed.
    image_elements = driver.find_elements(By.XPATH, '//img[contains(@src, "otstatic.com")]')
    if len(image_elements) > 1:
        record["image_url"] = image_elements[1].get_attribute('src')
    else:
        record["image_url"] = "No valid image found"

    # Collect comments from up to max_pages pages.
    all_comments = []
    for current_page in range(1, max_pages + 1):
        comments = _current_page_comments(driver)
        all_comments.append(comments)
        print(f"Page {current_page} Comments:", comments)  # Debug print

        if current_page == max_pages:
            break  # last requested page: do not load a page that would never be read
        if not _open_next_comments_page(driver):
            print("Failed to locate or click the next button. Ending pagination.")
            break

    # Join all comments into a single string
    record["comments"] = " ".join(all_comments)

    record["about_rest"] = _text_or_default(
        driver,
        './/span[contains(@class, "l9bbXUdC9v0- ZatlKKd1hyc- ukvN6yaH1Ds- l-AMWW5ZrIg-")]',
        "About Restaurant not found",
    )
    print("About Restaurant:", record["about_rest"])  # Debug print

    # Append the scraped record to the shared column lists.
    for column, scraped in record.items():
        data[column].append(scraped)

    rest_count += 1
    print(f"Total Restaurants scraped: {rest_count}\n")


def get_restaurant_links(driver, max_scrolls=22, scroll_increment=500):
    """Scroll the current results page and collect the restaurant link URLs.

    Args:
        driver: Selenium WebDriver already showing a search-results page.
        max_scrolls: Number of scroll steps to perform (default 22).
        scroll_increment: Pixels scrolled per step (default 500).

    Returns:
        List of unique ``href`` values in the order they were first seen.
        The list may be partial (or empty) if waiting for links times out.
    """
    link_xpath = "//a[contains(@class, 'qCITanV81-Y-')]"
    all_urls = []
    seen = set()  # O(1) duplicate check; all_urls keeps first-seen order
    time.sleep(random.randint(2, 3))  # Adding random delay to avoid detection

    for _ in range(max_scrolls):
        # Scroll down by the increment
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")

        # Wait for restaurant links to become visible and ready
        try:
            WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, link_xpath))
            )
        except TimeoutException:
            print("Timeout waiting for restaurant links to become visible")
            break

        # Add the href from each link element to the list, avoiding duplicates
        for elem in driver.find_elements(By.XPATH, link_xpath):
            try:
                url = elem.get_attribute('href')
            except StaleElementReferenceException:
                # The element was replaced while scrolling; skip it here. A later
                # scroll pass re-queries the page and can pick it up again.
                continue
            if url and url not in seen:
                seen.add(url)
                all_urls.append(url)

    print(f"Total URLs collected so far: {len(all_urls)}")
    return all_urls


def click_next_page(driver):
    """Click the results "next page" link.

    Args:
        driver: Selenium WebDriver showing a search-results page.

    Returns:
        True if the click was issued, False if the link is missing, not
        clickable within 10 seconds, obscured by another element, or went
        stale. The reason is printed in the failure case.
    """
    next_locator = (By.XPATH, "//a[@aria-label='Go to the next page']")
    try:
        # Plain lookup first so a page without the link fails immediately
        # instead of waiting out the timeout below.
        next_button = driver.find_element(*next_locator)

        # Scroll into view
        driver.execute_script("arguments[0].scrollIntoView();", next_button)

        # Wait until the link is clickable and click the element the wait
        # returns, not the earlier reference, which may have been re-rendered.
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_locator)
        )

        # NOTE(review): no JavaScript click fallback on purpose. An intercepted
        # click appears to be how a non-advancing last-page link is detected;
        # forcing it could make the caller loop forever. Confirm against the site.
        next_button.click()
        return True
    except (NoSuchElementException, ElementClickInterceptedException,
            ElementNotInteractableException, StaleElementReferenceException,
            TimeoutException) as e:
        print(f"Could not click next page: {e}")
        return False


# Data structure to store the restaurant details: one list per output column.
# scrape_restaurant appends one value to every list per restaurant.
data = {
    "url": [],
    "rest_name": [],
    "number_of_reviews": [],
    "rating": [],
    "food_type": [],
    "coupon": [],
    "food": [],
    "service": [],
    "ambience": [],
    "value": [],
    "about_rest": [],
    "comments": [],
    "image_url": [],  # Added image_url to store the image links
}

# Set up the Selenium WebDriver (e.g., Chrome)
search_url = 'https://www.opentable.com/s?dateTime=2024-09-28T19%3A00%3A00&covers=2&latitude=37.780885&longitude=-122.2852606&shouldUseLatLongSearch=false&originCorrelationId=f67565c9-ee78-45ce-863b-3b411800a93b'
driver = webdriver.Chrome()

all_urls = []
rest_count = 0

try:
    driver.get(search_url)

    # Collect restaurant URLs from every results page. A restaurant can be
    # listed on more than one page, so de-duplicate across pages to avoid
    # scraping it twice.
    seen_urls = set()
    while True:
        for url in get_restaurant_links(driver):
            if url not in seen_urls:
                seen_urls.add(url)
                all_urls.append(url)

        # Try to go to the next page; if no next page, break the loop
        if not click_next_page(driver):
            print("No more pages to navigate.")
            break

    # Scrape data for each restaurant URL
    for url in all_urls:
        scrape_restaurant(driver, url)
finally:
    # Runs even when scraping fails part-way (e.g. the browser window is
    # closed), so rows collected so far are saved and the browser is released.
    # Any exception still propagates afterwards.
    try:
        # Save the data to a CSV file
        pd.DataFrame(data).to_csv('restaurant_data_with_images.csv', index=False)
    finally:
        # Close the browser
        driver.quit()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packa

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packa

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packa

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start
    self.io_loop.start()
  File "C:\Users\Maria\anaconda3\Lib\site-packa

AttributeError: _ARRAY_API not found

Total URLs collected so far: 53
Total URLs collected so far: 53
Total URLs collected so far: 53
Total URLs collected so far: 53
Could not click next page: Message: element click intercepted: Element <a href="/" class="ojKcSDzr190- y4S9mw-uCFI- g-dxt-fQ2ZU- C7Tp-bANpE4-" aria-label="Go to the next page">...</a> is not clickable at point (981, 395). Other element would receive the click: <li>...</li>
  (Session info: chrome=129.0.6668.90)
Stacktrace:
	GetHandleVerifier [0x00007FF71746B645+29573]
	(No symbol) [0x00007FF7173E0470]
	(No symbol) [0x00007FF71729B6EA]
	(No symbol) [0x00007FF7172F74EE]
	(No symbol) [0x00007FF7172F4F3C]
	(No symbol) [0x00007FF7172F2408]
	(No symbol) [0x00007FF7172F161A]
	(No symbol) [0x00007FF7172E36BE]
	(No symbol) [0x00007FF7173172FA]
	(No symbol) [0x00007FF7172E2FF6]
	(No symbol) [0x00007FF717317510]
	(No symbol) [0x00007FF7173386BC]
	(No symbol) [0x00007FF7173170A3]
	(No symbol) [0x00007FF7172E12DF]
	(No symbol) [0x00007FF7172E2441]
	GetHandleVerifier [0x000

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=129.0.6668.90)
Stacktrace:
	GetHandleVerifier [0x00007FF71746B645+29573]
	(No symbol) [0x00007FF7173E0470]
	(No symbol) [0x00007FF71729B6EA]
	(No symbol) [0x00007FF71726FCD5]
	(No symbol) [0x00007FF71731EF67]
	(No symbol) [0x00007FF717337FC1]
	(No symbol) [0x00007FF7173170A3]
	(No symbol) [0x00007FF7172E12DF]
	(No symbol) [0x00007FF7172E2441]
	GetHandleVerifier [0x00007FF71779C58D+3375821]
	GetHandleVerifier [0x00007FF7177E7987+3684039]
	GetHandleVerifier [0x00007FF7177DCDAB+3640043]
	GetHandleVerifier [0x00007FF71752B7C6+816390]
	(No symbol) [0x00007FF7173EB77F]
	(No symbol) [0x00007FF7173E75A4]
	(No symbol) [0x00007FF7173E7740]
	(No symbol) [0x00007FF7173D659F]
	BaseThreadInitThunk [0x00007FFEDBAC257D+29]
	RtlUserThreadStart [0x00007FFEDDA8AF28+40]
