# Setup Environment

In [2]:
import MySQLdb
from pandas.io import sql
import pandas as pd
from dotenv import load_dotenv
import os

# Load the variables defined in the local .env file into the process
# environment so the os.environ lookups in the next cell can find them.
load_dotenv()

True

In [3]:
# Read the database connection settings from the environment (populated from
# the .env file). Any setting that is not defined comes back as None.
DB_USER, DB_PASS, DB_HOST, DB_NAME = (
    os.getenv(setting) for setting in ("DB_USER", "DB_PASS", "DB_HOST", "DB_NAME")
)

# Data Cleaning
Since the raw data is extremely large, we will filter down to a single business, both for faster processing and for financial reasons 💰

In [26]:
# Load the two raw inputs: the review rows joined to their business record,
# and the table mapping each business_id to a category.
df_businesses = pd.read_csv(filepath_or_buffer='./raw_data/yelp_reviews_with_business.csv')
df_categories = pd.read_csv(filepath_or_buffer='./raw_data/business_categories.csv')

In [27]:
# The category column has leading/trailing spaces; strip them in place so
# exact-match comparisons against a category name behave as expected.
df_categories['category'] = df_categories['category'].str.strip()

In [28]:
# Get the unique list of categories, then keep the ones that mention
# "restaurant" (case-insensitive substring match).
unique_categories = df_categories['category'].unique().tolist()
restaurant_categories = [
    category
    for category in unique_categories
    # .unique() also returns NaN for blank cells; skip non-strings so the
    # substring test cannot raise AttributeError on a float.
    if isinstance(category, str) and 'restaurant' in category.lower()
]

In [29]:
# Show the category names that matched the "restaurant" substring filter.
restaurant_categories

['Restaurants', 'Pop-Up Restaurants', 'Restaurant Supplies']

In [30]:
# Keep only the category rows labelled exactly "Restaurants" (this excludes
# e.g. "Restaurant Supplies", which also matched the substring search).
df_restaurants = df_categories.loc[df_categories["category"].eq("Restaurants")]

In [31]:
# Preview the first rows of the "Restaurants" category mapping.
df_restaurants.head()

Unnamed: 0,business_id,category
17,MTSW4McQd7CbVtyjqoe9mw,Restaurants
30,CF33F8-E6oudUQ46HnavjQ,Restaurants
40,k0hlBqXX-Bt0vf1op7Jr1w,Restaurants
49,bBDDEgkFA1Otx9Lfe7BZUQ,Restaurants
56,eEOYSgkmpB90uNA7lDOMRA,Restaurants


In [32]:
# Distinct business ids that carry the "Restaurants" category.
restaurant_business_ids = pd.unique(df_restaurants['business_id'])

In [33]:
# Rebind df_restaurants to the review rows whose business is a restaurant.
# NOTE: from here on df_restaurants holds review rows, not category rows.
df_restaurants = df_businesses.loc[
    df_businesses["business_id"].isin(restaurant_business_ids)
]

In [34]:
# (rows, columns) of the restaurant-only review table.
df_restaurants.shape

(836680, 22)

In [35]:
# List the available columns (business attributes followed by review fields).
df_restaurants.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id',
       'review_stars', 'useful', 'funny', 'cool', 'text', 'date'],
      dtype='object')

In [36]:
# Count review rows per restaurant and list the businesses from most- to
# least-reviewed, renumbering the index to match the ranking.
(
    df_restaurants
    .groupby('business_id')
    .size()
    .reset_index(name='review_count')
    .sort_values('review_count', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,business_id,review_count
0,ytynqOUb3hjKeJfRj5Tshw,5778
1,PP3BBaVxZLcJU54uP_wL6Q,4293
2,IkY2ticzHEn4QFn8hQLSWg,3428
3,9PZxjhTIU7OgPIzuGi89Ew,3264
4,ctHjyadbDQAtUFfkcAFEHw,3173
...,...,...
8064,ehmz5MfGLWSLr8vKL5LFbg,5
8065,2JJO0xpG5J93Ic8top5luQ,5
8066,X_VJmXGiV6NqPwCrqjwPyw,5
8067,XbFRkhVgX3jggGxfAaZyuA,5


In [37]:
# Rank restaurants by number of review rows and keep the five most-reviewed.
top_restaurants = (
    df_restaurants
    .groupby('business_id')
    .size()
    .reset_index(name='review_count')
    .sort_values('review_count', ascending=False)
    .head(5)
)

In [38]:
# Renumber the rows 0..n-1; after sorting, the index still holds the
# pre-sort row positions.
top_restaurants = top_restaurants.reset_index(drop=True)

In [51]:
# Drop the computed per-business count before merging: the business table has
# its own 'review_count' column, so keeping this one would yield suffixed
# duplicates (review_count_x / review_count_y) in the merged frame.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (a plain drop raises KeyError on the second execution).
top_restaurants = top_restaurants.drop(columns='review_count', errors='ignore')

In [52]:
# Inner-join the top business ids against df_businesses to pull every review
# row for those restaurants, then renumber the index for readability.
top_restaurant_reviews = (
    top_restaurants
    .merge(df_businesses, on='business_id', how='inner')
    .reset_index(drop=True)
)

In [53]:
# (rows, columns) of the reviews belonging to the top restaurants.
top_restaurant_reviews.shape

(19936, 22)

In [54]:
# Label each review from its star rating: 3 stars or more -> "positive",
# anything else (including a missing rating) -> "negative".
# NOTE(review): 3-star reviews are counted as positive — confirm this is intended.
top_restaurant_reviews["sentiment"] = (
    top_restaurant_reviews["review_stars"]
    .ge(3)
    .map({True: "positive", False: "negative"})
)

In [55]:
# Check the class balance of the derived sentiment label.
top_restaurant_reviews['sentiment'].value_counts()

sentiment
positive    16331
negative     3605
Name: count, dtype: int64

In [56]:
# To keep the project simple, narrow the data to one business: the id below is
# the first row (highest review_count) of the per-business ranking shown earlier.
df_market = top_restaurant_reviews.loc[
    top_restaurant_reviews["business_id"].eq("ytynqOUb3hjKeJfRj5Tshw")
]

In [58]:
# Confirm the columns that will be written to the database (now including 'sentiment').
df_market.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'review_id', 'user_id',
       'review_stars', 'useful', 'funny', 'cool', 'text', 'date', 'sentiment'],
      dtype='object')

# Load into Database

In [14]:
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine

In [65]:
# Build the connection URL from its parts instead of an f-string, so that
# special characters in the credentials (e.g. '@', '/', ':') are escaped
# rather than corrupting the URL.
# DB_HOST / DB_NAME come from the .env file; the previous hard-coded values
# are kept as fallbacks when they are not set.
# NOTE(review): assumes DB_HOST is a bare hostname without a port — confirm against .env.
db_url = sqlalchemy.engine.URL.create(
    drivername="mysql+pymysql",
    username=DB_USER,
    password=DB_PASS,
    host=DB_HOST or "localhost",
    port=3306,
    database=DB_NAME or "yelp_db",
)
engine = create_engine(db_url)

# Test connection by executing a simple query
query = """DESCRIBE market_reviews"""

pd.read_sql(query, engine)

Unnamed: 0,Field,Type,Null,Key,Default,Extra
0,business_id,varchar(255),NO,,,
1,name,varchar(255),YES,,,
2,address,varchar(255),YES,,,
3,city,varchar(100),YES,,,
4,state,varchar(50),YES,,,
5,postal_code,varchar(20),YES,,,
6,latitude,"decimal(10,7)",YES,,,
7,longitude,"decimal(10,7)",YES,,,
8,stars,"decimal(2,1)",YES,,,
9,review_count,int,YES,,,


In [66]:
# Insert the selected business's reviews into the existing market_reviews table.
# if_exists='append' adds rows to whatever is already there, so re-running this
# cell inserts the same reviews again. index=False leaves the DataFrame index
# out of the written columns.
df_market.to_sql('market_reviews', con=engine, if_exists='append', index=False)

5778

In [67]:
# Check table: read back a small sample to confirm the insert landed.
query = """SELECT * FROM market_reviews LIMIT 10"""

pd.read_sql(query, engine)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,hours,review_id,user_id,review_stars,useful,funny,cool,text,date,sentiment
0,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",kKC5pBPkUCWo6mKYFUewRw,mGnZFbk2gqLLtGW-mYo__A,5.0,0,0,0,I love this place! Doesn't take me long to get...,2016-08-25 16:30:53,positive
1,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",sFQrhTbTah0o2kU_Pi2D0Q,Tu4ATXLhy8kRTjpQCnl2pA,5.0,0,0,0,"One of my favorite places to go to in Philly, ...",2016-07-14 20:24:15,positive
2,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",kqn1uP3LRVjVDUD44ZSu1A,vRNb2IaGlsZRA_wUf3Ov8w,5.0,0,0,0,This might be a bit unfair to have a single re...,2017-04-07 22:27:22,positive
3,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",qMsTe9QznpNQk1AKbYLp-w,29K-usmZfVDeIaQ85EG54A,4.0,5,2,3,"Alright, I remember the first time I went to t...",2017-07-18 17:33:29,positive
4,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",nmMIRBNONIICe7CFHnfadQ,1jE--VcTddwXGampD23JCg,4.0,0,0,0,"It's an experience, to say the least! Not as ...",2013-08-03 20:18:22,positive
5,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",DgBsY-hNMTBWaXVlHVG7LA,-x7NKQ0qAcGaabJUDHm59A,4.0,0,0,0,Food selection is bountiful. Gets a bit crowde...,2017-02-19 17:17:57,positive
6,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",dXVhYlcX9X9kqVI16AoBHg,lavyYLh68LxIBhJdIE5f_g,5.0,0,0,0,"fresh seafood and produce. fresh breads, chees...",2015-08-19 10:21:14,positive
7,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",9CK6oJPgJEqa9pBCAhZxIg,5jMVOTXxWDoGx4iO9q2EPA,5.0,0,0,0,Reading terminal is always crowded. The food i...,2017-01-27 22:53:28,positive
8,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",Rab68s3xbKRyMgIVMQoXCw,3_4Y3BFXlhFiCiCJZTUV2g,5.0,0,0,0,Go hungry and early! Can eat anywhere in the ...,2018-05-28 22:20:00,positive
9,ytynqOUb3hjKeJfRj5Tshw,Reading Terminal Market,51 N 12th St,Philadelphia,PA,19107,39.953342,-75.158855,4.5,5721,...,"{'Monday': '8:0-18:0', 'Tuesday': '8:0-18:0', ...",WbaVLgs_sRJG4m2L6IuABQ,pIe5kiaHa94166xDuugQ1w,5.0,0,0,0,"This is a great spot for fresh local foods, ca...",2014-11-12 18:18:17,positive
