# Prelim Feature Engineering: Restaurant
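Feature engineering done *before* splitting train/test. It is intended only for our own viewing and analysis and should not be used in the formal modelling: the features are computed over the full dataset (labels included), so using them in a model would leak test-set information.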

In [None]:
import pandas as pd

# Location and column layout of the metadata file. The file is read as
# tab-separated text; column names are supplied here, so pandas treats the
# first line of the file as data rather than as a header.
metadata_path = '../00_dataset/YelpZip/metadata'  # Update with correct file path
metadata_cols = ["user_id", "prod_id", "rating", "label", "date"]

# Read the file, then derive two columns in one pass:
#   date    - parsed to datetime so that date arithmetic works later on
#   is_fake - boolean flag, True where the review's label equals -1
metadata = pd.read_csv(metadata_path, sep="\t", names=metadata_cols).assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    is_fake=lambda frame: frame["label"].eq(-1),
)

# Per-restaurant aggregates. Each result is a Series indexed by prod_id and
# named after the column it is meant to become in the feature table.
reviews_by_restaurant = metadata.groupby("prod_id")
ratings_by_restaurant = reviews_by_restaurant["rating"]
fake_flags_by_restaurant = reviews_by_restaurant["is_fake"]

# Review volume: total, flagged as fake, and the remainder (total - fake).
restaurant_review_counts = ratings_by_restaurant.count().rename("total_reviews_for_restaurant")
restaurant_fake_counts = fake_flags_by_restaurant.sum().rename("num_fake_reviews_for_restaurant")
restaurant_real_counts = (
    fake_flags_by_restaurant.count() - restaurant_fake_counts
).rename("num_real_reviews_for_restaurant")

# Rating distribution: centre (mean, median) and spread (standard deviation).
# pandas' default std is the sample standard deviation, so a restaurant with a
# single review yields NaN here.
restaurant_avg_rating = ratings_by_restaurant.mean().rename("avg_rating_for_restaurant")
restaurant_rating_std = ratings_by_restaurant.std().rename("std_dev_rating_for_restaurant")
restaurant_median_rating = ratings_by_restaurant.median().rename("median_rating_for_restaurant")

# Compute review frequency per restaurant.
#
# The feature is the average number of days per review:
#     (latest_review_date - earliest_review_date).days / number_of_reviews
# The divisor is floored at 2. A restaurant with a single review has a
# zero-day span, so its value is 0 regardless of the floor.
#
# How to read the value:
#   1. High values (e.g., 30+ days per review) → LOW activity
#      Means the restaurant gets infrequent reviews.
#      This is expected for small/local restaurants.
#      Not necessarily suspicious unless combined with high rating standard
#      deviation.
#   2. Moderate values (e.g., 3-15 days per review) → NORMAL activity
#      Restaurants typically get a review every few days to a week.
#      Popular places should fall in this range.
#   3. Low values (e.g., <1 day per review) → HIGH activity
#      Means the restaurant is getting multiple reviews per day.
#      This could be organic (high foot traffic places like chains) or
#      suspicious (fake reviews).
#      Suspicious if:
#        - There is a sudden burst of reviews after inactivity.
#        - A large percentage of reviews come from new users.
#        - Many reviews have similar timestamps or wording.

# First date, last date and number of dated reviews for each restaurant.
review_date_stats = metadata.groupby("prod_id")["date"].agg(["min", "max", "count"])

# Whole days between the first and the last review.
review_span_days = (review_date_stats["max"] - review_date_stats["min"]).dt.days
reviews_in_span = review_date_stats["count"].clip(lower=2)

# The result is a Series indexed by prod_id. The name `restaurant_review_dates`
# is kept because the feature-merge step refers to it.
restaurant_review_dates = (review_span_days / reviews_in_span).rename("review_frequency_for_restaurant")

# Compute unique and repeat reviewers count
# Disabled: all reviewers are unique, no reviewer reviewed the same restaurant
# twice, so these two features carry no information.
#unique_reviewers_count = metadata.groupby("prod_id")["user_id"].nunique().rename("unique_reviewers_count")
#repeat_reviewers_count = (metadata.groupby("prod_id")["user_id"].count() - unique_reviewers_count).rename("repeat_reviewers_count")

# Compute extreme (1-star, 5-star) and neutral (3-star) rating percentages.
# Disabled in favour of the single Extreme Rating Index computed below.
# extreme_reviews = metadata[metadata["rating"].isin([1, 5])].groupby("prod_id")["rating"].count().rename("num_extreme_reviews")
# neutral_reviews = metadata[metadata["rating"] == 3].groupby("prod_id")["rating"].count().rename("num_neutral_reviews")
# total_reviews = metadata.groupby("prod_id")["rating"].count()
# percent_extreme_reviews = (extreme_reviews / total_reviews).rename("percent_extreme_reviews").fillna(0)
# percent_neutral_reviews = (neutral_reviews / total_reviews).rename("percent_neutral_reviews").fillna(0)


# Compute Extreme Rating Index: the mean absolute distance of a restaurant's
# ratings from the neutral 3-star rating. Assuming ratings are on a 1-5 star
# scale, the index ranges from 0 to 2:
#   0 → All reviews are 3-star (perfectly neutral).
#   1 → On average one star away from neutral, e.g. all reviews are 2-star or
#       4-star, or an even split between 3-star and 1-/5-star reviews.
#       (An equal mix of 2-, 3- and 4-star reviews gives 2/3, not 1.)
#   2 → All reviews are either 1-star or 5-star (highly polarized).

# Per-review distance from neutral; stored on `metadata` as a new column.
metadata["rating_deviation"] = (metadata["rating"] - 3).abs()
extreme_rating_index = metadata.groupby("prod_id")["rating_deviation"].mean().rename("extreme_rating_index")


# Assemble the per-restaurant feature table: one row per prod_id (in order of
# first appearance in `metadata`) and one column per feature series, added in
# the order listed here. Each series is indexed by prod_id and already carries
# its final column name.
per_restaurant_features = [
    restaurant_review_counts,
    restaurant_fake_counts,
    restaurant_real_counts,
    restaurant_avg_rating,
    restaurant_rating_std,
    restaurant_median_rating,
    restaurant_review_dates,  # holds review_frequency_for_restaurant
    # Disabled features (their definitions are commented out as well):
    # unique_reviewers_count,
    # repeat_reviewers_count,
    # percent_extreme_reviews,
    # percent_neutral_reviews,
    extreme_rating_index,
]

restaurant_features = metadata[["prod_id"]].drop_duplicates()
for feature in per_restaurant_features:
    # A left merge keeps every restaurant even if a feature is missing for it.
    restaurant_features = restaurant_features.merge(feature, on="prod_id", how="left")

# Fill NaN values for standard deviation (caused by single reviews) with 0.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# `restaurant_features[col]` operates on an intermediate object, raises a
# FutureWarning, and will no longer update the DataFrame in pandas 3.0.
restaurant_features["std_dev_rating_for_restaurant"] = (
    restaurant_features["std_dev_rating_for_restaurant"].fillna(0)
)

# Save the processed dataset
restaurant_features.to_csv("processed_restaurant_features_2.csv", index=False)

# Display first few rows (must stay the last expression so the notebook shows it)
restaurant_features.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  restaurant_features["std_dev_rating_for_restaurant"].fillna(0, inplace=True)


Unnamed: 0,prod_id,total_reviews_for_restaurant,num_fake_reviews_for_restaurant,num_real_reviews_for_restaurant,avg_rating_for_restaurant,std_dev_rating_for_restaurant,median_rating_for_restaurant,review_frequency_for_restaurant,extreme_rating_index
0,0,88,7,81,3.613636,1.316839,4.0,7.511364,1.272727
1,1,25,11,14,2.64,1.577973,2.0,101.96,1.48
2,2,33,6,27,4.030303,1.211529,4.0,72.060606,1.454545
3,2780,6,0,6,5.0,0.0,5.0,32.0,2.0
4,4,75,17,58,3.68,1.209869,4.0,37.533333,1.16
