In [None]:
import os
import pandas as pd

## Download Yelp Dataset


Download the [Yelp Dataset](https://www.yelp.com/dataset), untar the archive, and store the extracted files under the `./data` directory.


In [None]:
# JSON files read by the loading cells below.
reviews_fname = "yelp_academic_dataset_review.json"
businesses_fname = "yelp_academic_dataset_business.json"

# Directory holding the dataset files, relative to the notebook.
data_dir = "./data"

## Yelp Businesses


In [None]:
businesses_path = os.path.join(data_dir, businesses_fname)
business_columns = ["business_id", "stars", "review_count", "categories"]

# The file is read as JSON Lines (one record per line); keep only the listed columns.
businesses = pd.read_json(businesses_path, lines=True)[business_columns]
businesses.head()

In [None]:
# (rows, columns) of the businesses frame after column selection.
businesses.shape

In [None]:
# Number of missing values in each column.
businesses.isna().sum()

In [None]:
# Drop every row that has a missing value in any column.
# The name is rebound instead of using `inplace=True`, so the frame object
# that earlier cells displayed is not mutated behind the reader's back.
businesses = businesses.dropna()
businesses.shape

In [None]:
# Summary statistics of the business star ratings.
businesses[["stars"]].describe()

In [None]:
# Summary statistics of the per-business review counts.
businesses[["review_count"]].describe()

## Preprocess Yelp Businesses


Reviews from businesses with low ratings or low review counts may include noise, outliers, or uninformative content. So, we will filter out such businesses to ensure high data quality.


In [None]:
# Thresholds a business must meet to be kept.
min_review_count = 15
min_stars = 3.0

has_enough_reviews = businesses["review_count"].ge(min_review_count)
is_well_rated = businesses["stars"].ge(min_stars)

# Keep only the businesses that satisfy both conditions.
filtered_businesses = businesses[has_enough_reviews & is_well_rated]

In [None]:
# (rows, columns) of the businesses that passed the rating and review-count filter.
filtered_businesses.shape

In [None]:
# Star-rating statistics after filtering; the minimum should now be >= min_stars.
filtered_businesses[["stars"]].describe()

In [None]:
# Review-count statistics after filtering; the minimum should now be >= min_review_count.
filtered_businesses[["review_count"]].describe()

## Yelp Reviews


In [None]:
reviews_path = os.path.join(data_dir, reviews_fname)
review_columns = ["review_id", "business_id", "stars", "text"]

# The file is read as JSON Lines (one record per line); keep only the listed columns.
reviews = pd.read_json(reviews_path, lines=True)[review_columns]
reviews.head()

In [None]:
# (rows, columns) of the reviews frame after column selection.
reviews.shape

In [None]:
# Number of missing values in each column.
reviews.isna().sum()

In [None]:
# Summary statistics of the per-review star ratings.
reviews[["stars"]].describe()

In [None]:
# Distribution of review length in characters.
# `.str.len()` is the built-in string accessor for this; unlike `apply(len)`
# it yields NaN for a missing text instead of raising a TypeError.
reviews["text"].str.len().describe()

In [None]:
# Distribution of review length in whitespace-separated words.
reviews["text"].map(lambda text: len(text.split())).describe()

## Preprocess Reviews


Now filter out reviews that are excessively short or long (fewer than 42 or more than 149 words), as well as reviews rated below 3 stars.


In [None]:
# Thresholds a review must meet to be kept.
min_stars = 3.0
min_words, max_words = 42, 149  # inclusive bounds on whitespace-separated words

# Count the words of each review once; the original predicate split every
# text twice per row.
review_word_counts = reviews["text"].apply(lambda x: len(x.split()))

# Keep reviews rated at least `min_stars` whose word count lies in
# [min_words, max_words]; `Series.between` is inclusive on both ends by default.
filtered_reviews = reviews[
    (reviews["stars"] >= min_stars)
    & review_word_counts.between(min_words, max_words)
]

In [None]:
# (rows, columns) of the reviews that passed the rating and length filter.
filtered_reviews.shape

In [None]:
# Star-rating statistics after filtering; the minimum should now be >= min_stars.
filtered_reviews[["stars"]].describe()

In [None]:
# Word-count distribution of the reviews that passed the filter.
filtered_reviews["text"].map(lambda text: len(text.split())).describe()

## Group Businesses & Reviews


In [None]:
# Inner-join reviews with the retained businesses, so reviews of businesses
# that were filtered out are dropped. Both frames have a `stars` column, so
# the merge suffixes them `_x` (review) and `_y` (business); rename them to
# explicit names and discard the business columns that are not needed here.
merged_df = (
    filtered_reviews.merge(filtered_businesses, on="business_id", how="inner")
    .drop(columns=["review_count", "categories"])
    .rename(columns={"stars_x": "review_stars", "stars_y": "business_stars"})
)
merged_df.head()

In [None]:
# (rows, columns) of the merged frame; each row is one review.
merged_df.shape

In [None]:
# Number of distinct businesses that still have at least one review after the merge.
merged_df["business_id"].nunique()

In [None]:
# Distribution of the number of remaining reviews per business.
(
    merged_df.groupby("business_id")
    .size()
    .reset_index(name="count")
    .describe()
)

In [None]:
# Keep only businesses that still have enough reviews.
# This has to be repeated here: many reviews were removed above, so the
# number of reviews left per business can be lower than the original
# `review_count` column.
min_review_count = 15

# Group size broadcast back onto every review row. This replaces
# `groupby(...).filter(lambda ...)`, which calls a Python function once per
# business, with a single vectorized pass; it selects the same rows in the
# same order.
reviews_per_business = merged_df.groupby("business_id")["business_id"].transform(
    "size"
)

# `.copy()` makes the result an independent frame, as the `filter` result
# was, so later column assignments do not trigger SettingWithCopyWarning.
yelp_dataset = merged_df[reviews_per_business >= min_review_count].copy()
yelp_dataset.head()

In [None]:
# (rows, columns) of the dataset after the per-business review-count filter.
yelp_dataset.shape

In [None]:
# Distribution of reviews per business; the minimum should now be >= min_review_count.
(
    yelp_dataset.groupby("business_id")
    .size()
    .reset_index(name="count")
    .describe()
)

In [None]:
# Statistics of the business-level rating; it is repeated on every review row
# of a business, so these figures are weighted by review count.
yelp_dataset[["business_stars"]].describe()

In [None]:
# Statistics of the per-review star ratings in `yelp_dataset`.
yelp_dataset[["review_stars"]].describe()

In [None]:
# Word-count distribution of the reviews in `yelp_dataset`.
yelp_dataset["text"].map(lambda text: len(text.split())).describe()