In [13]:
import pandas as pd
import json
from pathlib import Path

In [14]:
business_df = pd.read_json(r"../data/raw/yelp_academic_dataset_business.json", lines=True)

In [15]:
# Keep only businesses with "Restaurant" or "Food" in categories
business_df = business_df[business_df["categories"].str.contains("Restaurants | Food", na=False)]
# Select relevant fields
business_df = business_df[["business_id", "name", "city", "address", "state", "categories"]]

In [16]:
# For simplicity, limit to first 100,000 reviews (adjust as needed)
review_data = []
with open(r"../data/raw/yelp_academic_dataset_review.json", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 100000:  # Limit for demo; remove for full dataset
            break
        review_data.append(json.loads(line))

# Convert to DataFrame and select relevant fields
review_df = pd.DataFrame(review_data)
review_df = review_df[["business_id", "stars", "text"]]

In [17]:
merged_df = pd.merge(business_df, review_df, on="business_id", how="inner")

In [11]:
merged_df.head()

Unnamed: 0,business_id,name,city,address,state,categories,stars,text
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,This is nice little Chinese bakery in the hear...
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,This is the bakery I usually go to in Chinatow...
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",5.0,"A delightful find in Chinatown! Very clean, an..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",5.0,I ordered a graduation cake for my niece and i...
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...


In [18]:
merged_df = merged_df.rename(columns={
    "name": "restaurant_name",
    "text": "review",
    "stars": "rating"
})

In [13]:
merged_df.head()

Unnamed: 0,business_id,restaurant_name,city,address,state,categories,rating,review
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,This is nice little Chinese bakery in the hear...
1,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,This is the bakery I usually go to in Chinatow...
2,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",5.0,"A delightful find in Chinatown! Very clean, an..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",5.0,I ordered a graduation cake for my niece and i...
4,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,935 Race St,PA,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,HK-STYLE MILK TEA: FOUR STARS\n\nNot quite su...


In [14]:
merged_df.describe

<bound method NDFrame.describe of                   business_id               restaurant_name          city  \
0      MTSW4McQd7CbVtyjqoe9mw            St Honore Pastries  Philadelphia   
1      MTSW4McQd7CbVtyjqoe9mw            St Honore Pastries  Philadelphia   
2      MTSW4McQd7CbVtyjqoe9mw            St Honore Pastries  Philadelphia   
3      MTSW4McQd7CbVtyjqoe9mw            St Honore Pastries  Philadelphia   
4      MTSW4McQd7CbVtyjqoe9mw            St Honore Pastries  Philadelphia   
...                       ...                           ...           ...   
28444  TCROPjxfzCZzrQjqLqstSg  Red Cup Cafe & Hookah Lounge  Philadelphia   
28445  TCROPjxfzCZzrQjqLqstSg  Red Cup Cafe & Hookah Lounge  Philadelphia   
28446  TCROPjxfzCZzrQjqLqstSg  Red Cup Cafe & Hookah Lounge  Philadelphia   
28447  TCROPjxfzCZzrQjqLqstSg  Red Cup Cafe & Hookah Lounge  Philadelphia   
28448  TCROPjxfzCZzrQjqLqstSg  Red Cup Cafe & Hookah Lounge  Philadelphia   

                 address state  \
0      

In [19]:
# Function to derive rating_category
def get_rating_category(rating):
    if rating >= 4:
        return "Positive"
    elif rating == 3:
        return "Neutral"
    else:
        return "Negative"

In [20]:
# Create rating_category
merged_df["rating_category"] = merged_df["rating"].apply(get_rating_category)

In [21]:
# Combine city and state for location
# merged_df["location"] = merged_df["city"] + ", " + merged_df["state"]
merged_df["location"] = merged_df[["city", "state", "address"]].agg(", ".join, axis=1)
# merged_df["review"] = merged_df[["review", "categories"]].agg(", ".join, axis=1)

In [22]:
# Select final columns
final_df = merged_df[["restaurant_name", "review", "rating", "rating_category", "location"]]

In [23]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28449 entries, 0 to 28448
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   restaurant_name  28449 non-null  object 
 1   review           28449 non-null  object 
 2   rating           28449 non-null  float64
 3   rating_category  28449 non-null  object 
 4   location         28449 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.1+ MB


In [24]:
final_df["review"][0]

"This is nice little Chinese bakery in the heart of Philadelphia's Chinatown! The female cashier was very friendly (flirtatious!) and the pastries shown in nicely adorned display cases. I stopped by early one evening had a sesame ball, which was filled with bean paste. The glutinous rice of the ball was nicely flavored, similar to Bai Tang Gao. Definitely as place worth stopping at if you are in the area."

In [25]:
# Save to CSV
output_path = Path(r"../data/final")
output_path.mkdir(parents=True, exist_ok=True)
final_df.to_csv(r"../data/final/final_restaurant_reviews.csv" , index=False)