In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import csv

In [2]:
# Create a base URL
base_url = 'https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc'

In [3]:
def generate_yelp_urls(base_url, start, increment, max_reviews):
    """
    Function to generate a list of urls to pull reviews from.
    
    Parameters:
    base_url (str): String containing the base url.
    start (int): Integer to determine starting point.
    increment (int): Integer representing number of reviews per page.
    max_reviews (int): Maximum number of reviews to pull.

    Returns:
    urls (list): List of urls.
    """
    # Initialize list to contain urls
    urls = []

    # Loop to generate urls
    for i in range(0, max_reviews, increment):
        # Insert the start parameter before the sort_by parameter
        url = f"{base_url}&start={i}&sort_by=date_desc"
        urls.append(url)
    return urls

In [4]:
# Define increment
increment = 10
# Define max_reviews
max_reviews = 100

# Call the function to generate a list of urls
urls = generate_yelp_urls(base_url, start=0, increment=increment, max_reviews=max_reviews)
for url in urls:
    print(url)

https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=0&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=10&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=20&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=30&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=40&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=50&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=60&sort_by=date_desc
https://www.yelp.com/biz/brennans-of-houston-houston?osq=Brennans+Restaurant&sort_by=date_desc&start=70&sort_by=date_desc
https://www.yelp.com/biz/

In [5]:
# Initialize list to store review data
all_reviews_data = []

# Loop through urls to get multiple pages of reviews
for url in urls:
    # Send a get request to fetch the webpage content
    response = requests.get(url)
    webpage = response.content

    # Parse the webpage content with BeautifulSoup
    soup = BeautifulSoup(webpage, 'html.parser')

    # Find the <ul> containing all reviews
    ul_tags = soup.find_all('ul', class_='list__09f24__ynIEd')

    # Loop through <ul> tags to retrieve correct data
    for ul in ul_tags:
        review_items = ul.find_all('li', class_='y-css-1jp2syp')
        review_items = review_items[:-1] # The last <li> tag contains no review information

        if review_items:
            # Loop through each individual review within the <ul>
            for review in review_items:
                print('Review Found')
                # Extract the reviewer's name
                name_tag = review.find('a', class_='y-css-12ly5yx')
                name = name_tag.get_text() if name_tag else 'N/A'
        
                # Extract the reviewer's location
                location_tag = review.find('span', class_='y-css-h9c2fl')
                location = location_tag.get_text() if location_tag else 'N/A'
        
                # Extract the review rating
                rating_tag = review.find('div', class_='y-css-9tnml4')
                rating = rating_tag['aria-label'] if rating_tag else 'N/A'
        
                # Extract the review date
                date_tag = review.find('span', class_='y-css-wfbtsu')
                date = date_tag.get_text() if date_tag else 'N/A'
        
                # Extract the review text
                text_tag = review.find('span', class_='raw__09f24__T4Ezm')
                text = text_tag.get_text() if text_tag else 'N/A'
                    
                # Append the extracted data to the list
                all_reviews_data.append([name, location, rating, date, text])

# Convert the list of reviews into a DataFrame
df = pd.DataFrame(all_reviews_data, columns=['Name', 'Location', 'Rating', 'Date', 'Text'])

print("Scraping complete! Data saved to DataFrame")

Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found
Review Found

In [6]:
df.head()

Unnamed: 0,Name,Location,Rating,Date,Text
0,,,,,
1,Laura W.,"Houston, TX",5 star rating,"Aug 19, 2024",I share a birthday with a friend and Brennan's...
2,Simone P.,"Edgehill, Nashville, TN",2 star rating,"Aug 17, 2024",I went for Houston restaurant week and ordered...
3,S P.,"Bellaire, TX",3 star rating,"Aug 17, 2024",We are here before seeing Hairspray at Hobby t...
4,Michelle W.,"Houston, TX",3 star rating,"Aug 16, 2024","I hate to say it, but their brunch HRW menu di..."


In [7]:
# Look into N/A values
na_rows = df[df['Name'] == 'N/A']

print(na_rows)

   Name Location Rating Date Text
0   N/A      N/A    N/A  N/A  N/A
11  N/A      N/A    N/A  N/A  N/A
22  N/A      N/A    N/A  N/A  N/A
33  N/A      N/A    N/A  N/A  N/A
44  N/A      N/A    N/A  N/A  N/A
55  N/A      N/A    N/A  N/A  N/A
66  N/A      N/A    N/A  N/A  N/A
77  N/A      N/A    N/A  N/A  N/A
88  N/A      N/A    N/A  N/A  N/A
99  N/A      N/A    N/A  N/A  N/A


In [8]:
# Drop rows containing N/A values
df_cleaned = df[(df != 'N/A').all(axis=1)]

df_cleaned.head()

Unnamed: 0,Name,Location,Rating,Date,Text
1,Laura W.,"Houston, TX",5 star rating,"Aug 19, 2024",I share a birthday with a friend and Brennan's...
2,Simone P.,"Edgehill, Nashville, TN",2 star rating,"Aug 17, 2024",I went for Houston restaurant week and ordered...
3,S P.,"Bellaire, TX",3 star rating,"Aug 17, 2024",We are here before seeing Hairspray at Hobby t...
4,Michelle W.,"Houston, TX",3 star rating,"Aug 16, 2024","I hate to say it, but their brunch HRW menu di..."
5,Anthony D.,"San Francisco, CA",5 star rating,"Aug 11, 2024",We made reservations for the Brennan's Kitche...


In [9]:
# Verify N/A values have been dropped
na_rows_updated = df_cleaned[df_cleaned['Name'] == 'N/A']

print(na_rows_updated)

Empty DataFrame
Columns: [Name, Location, Rating, Date, Text]
Index: []


In [10]:
# Save the DataFrame to a csv file
df_cleaned.to_csv('yelp_reviews.csv', index=False, encoding='utf-8')

print("DataFrame saved to cleaned_yelp_reviews.csv")

DataFrame saved to cleaned_yelp_reviews.csv
