# Setup Environment

In [2]:
from MySQLdb import _mysql
import pandas as pd
from dotenv import load_dotenv
import os

# Read the local .env file so DB_USER / DB_PASS can be looked up through os.environ.
load_dotenv()

True

In [5]:
# Database credentials are read from the environment (filled from .env by load_dotenv).
# os.getenv yields None for an unset variable, so a missing entry only shows up at connect time.
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")

## Connect to DB
Open a connection to the local MySQL database, then test it by listing its tables.

In [7]:
# Connection parameters for the local `yelp_db` schema; credentials come from the environment.
connection_settings = {
    "host": "localhost",
    "user": DB_USER,
    "password": DB_PASS,
    "database": "yelp_db",
}
db = _mysql.connect(**connection_settings)

In [9]:
# Smoke-test the connection: send SHOW TABLES; the rows are read back via store_result().
db.query("SHOW TABLES")

In [10]:
# Retrieve the result object for the SHOW TABLES query issued above so its rows can be read
result = db.store_result()

In [11]:
# maxrows=0 asks for every remaining row instead of just one
tables = result.fetch_row(maxrows=0)

# The first field of each row is the table name (the low-level client returns it as bytes)
for row in tables:
    print(row[0])

b'business_categories'
b'yelp_reviews_with_business'


# Data Cleaning
Since the raw data is extremely large, we will filter it down to a single business category (restaurants) to speed up processing and keep costs down 💰

In [11]:
# Despite the name, this frame is review-level: business attributes joined with
# review_id / user_id / text / date columns.
df_businesses = pd.read_csv('./raw_data/yelp_reviews_with_business.csv')

In [4]:
# Mapping of business_id -> category label
df_categories = pd.read_csv('./raw_data/business_categories.csv')

In [26]:
# Category labels carry stray leading/trailing whitespace; normalise them so
# exact-match filters on the label behave as expected.
df_categories = df_categories.assign(category=df_categories['category'].str.strip())

In [27]:
# Distinct category labels, in order of first appearance.
unique_categories = df_categories['category'].unique().tolist()

# Keep every label that mentions "restaurant" (case-insensitive).
# - The isinstance guard skips NaN, which unique() returns for a missing category
#   and which has no string methods (the unguarded version raises AttributeError).
# - No strip() is needed: surrounding whitespace cannot change a substring test.
restaurant_categories = [
    category
    for category in unique_categories
    if isinstance(category, str) and 'restaurant' in category.lower()
]

In [28]:
# Inspect the matched labels before choosing which one to filter on
restaurant_categories

['Restaurants', 'Pop-Up Restaurants', 'Restaurant Supplies']

In [30]:
# Keep only rows tagged exactly "Restaurants"; the other matches found above
# ('Pop-Up Restaurants', 'Restaurant Supplies') are not included.
df_restaurants = df_categories.loc[lambda frame: frame["category"] == "Restaurants"]

In [32]:
# Preview the (business_id, category) rows that survived the filter
df_restaurants.head()

Unnamed: 0,business_id,category
17,MTSW4McQd7CbVtyjqoe9mw,Restaurants
30,CF33F8-E6oudUQ46HnavjQ,Restaurants
40,k0hlBqXX-Bt0vf1op7Jr1w,Restaurants
49,bBDDEgkFA1Otx9Lfe7BZUQ,Restaurants
56,eEOYSgkmpB90uNA7lDOMRA,Restaurants


In [33]:
# Distinct ids of every business tagged "Restaurants"
restaurant_business_ids = df_restaurants['business_id'].unique()

In [34]:
# NOTE: this re-binds df_restaurants. It previously held category rows; from here on
# it holds the review-level rows whose business_id belongs to a restaurant.
df_restaurants = df_businesses.loc[
    lambda frame: frame["business_id"].isin(restaurant_business_ids)
]

In [35]:
# (rows, columns) of the restaurant-only subset
df_restaurants.shape

(836680, 22)

In [36]:
# List the available columns (business attributes plus per-review fields)
df_restaurants.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id',
       'review_stars', 'useful', 'funny', 'cool', 'text', 'date'],
      dtype='object')

In [46]:
# Full ranking of restaurants by number of review rows in this dataset, most-reviewed first.
# The computed 'review_count' is a row count and is separate from the frame's own
# 'review_count' column.
(
    df_restaurants
    .groupby('business_id')
    .size()
    .reset_index(name='review_count')
    .sort_values(by='review_count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,business_id,review_count
0,ytynqOUb3hjKeJfRj5Tshw,5778
1,PP3BBaVxZLcJU54uP_wL6Q,4293
2,IkY2ticzHEn4QFn8hQLSWg,3428
3,9PZxjhTIU7OgPIzuGi89Ew,3264
4,ctHjyadbDQAtUFfkcAFEHw,3173
...,...,...
8064,ehmz5MfGLWSLr8vKL5LFbg,5
8065,2JJO0xpG5J93Ic8top5luQ,5
8066,X_VJmXGiV6NqPwCrqjwPyw,5
8067,XbFRkhVgX3jggGxfAaZyuA,5


In [47]:
# Number of review rows per restaurant in this dataset.
review_counts = (
    df_restaurants.groupby('business_id').size().reset_index(name='review_count')
)

# Most-reviewed restaurants first.
ranked_counts = review_counts.sort_values(by='review_count', ascending=False)

# Keep the 100 most-reviewed restaurants.
top_restaurants = ranked_counts.head(100)

In [48]:
# Discard the pre-sort index so rows are numbered from 0 in ranking order
top_restaurants = top_restaurants.reset_index(drop=True)

In [51]:
# Show the final top-100 table (business_id, review_count)
top_restaurants

Unnamed: 0,business_id,review_count
0,ytynqOUb3hjKeJfRj5Tshw,5778
1,PP3BBaVxZLcJU54uP_wL6Q,4293
2,IkY2ticzHEn4QFn8hQLSWg,3428
3,9PZxjhTIU7OgPIzuGi89Ew,3264
4,ctHjyadbDQAtUFfkcAFEHw,3173
...,...,...
95,qxRTFagnexBZgDe055CjkQ,798
96,6zEWIsb6Lhr3BeoC3gm1lw,798
97,8j3blTZChklt3j89jxx0fw,791
98,AaTpjyw-EiODgi3tR4Xr-g,785
