In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Pre-process dataset

In [None]:
df = pd.read_csv('../00_dataset/YelpZip/metadata', 
                 sep='\t',
                 header=None,
                 names=["user_id", "prod_id", "rating", "label", "date"])
df

Unnamed: 0,user_id,prod_id,rating,label,date
0,5044,0,1.0,-1,2014-11-16
1,5045,0,1.0,-1,2014-09-08
2,5046,0,3.0,-1,2013-10-06
3,5047,0,5.0,-1,2014-11-30
4,5048,0,5.0,-1,2014-08-28
...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20
608594,56277,5039,2.0,1,2012-11-12
608595,265320,5039,1.0,1,2012-08-22
608596,161722,5039,4.0,1,2011-05-11


In [29]:
# Equalise 1 and -1 labels, only 8000 rows per label
positive_df = df[(df['label'] == 1)][:8000]
negative_df = df[(df['label'] == -1)][:8000]
dataset_df = pd.concat([positive_df, negative_df])
dataset_df

Unnamed: 0,user_id,prod_id,rating,label,date
7,5051,0,1.0,1,2014-12-05
8,5052,0,2.0,1,2014-11-26
9,5053,0,4.0,1,2014-11-17
10,5054,0,3.0,1,2014-10-20
11,5055,0,5.0,1,2014-10-13
...,...,...,...,...,...
55969,48392,496,5.0,-1,2008-06-18
55970,48393,496,5.0,-1,2008-06-11
55971,48394,496,5.0,-1,2008-03-28
55972,48395,496,5.0,-1,2008-03-22


# Create Train-test split

In [30]:
y = dataset_df['label']
X = dataset_df.drop('label', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature Aggregator Functions

In [31]:
def UserFeatureAggregator(X_data):
    df = X_data.copy()

    # Ensure 'date' column is in datetime format
    df['date'] = pd.to_datetime(df['date'])

    # Aggregate user features
    user_features = df.groupby('user_id').agg({
        'prod_id': 'count',  # no. of restaurant reviews per user
        'rating': ['mean', 'min', 'max', 'std'],  # rating statistics
        'date': ['min', 'max']  # First and last review dates
    })

    # std will be NA for people with only 1 review, so fill it with reviews.
    user_features[('rating', 'std')] = user_features[('rating', 'std')].fillna(0)

    # unique days active (no. of days the user has made a rating/review)
    user_activity = df.groupby('user_id')['date'].nunique()

    user_features[('unique_days_active', '')] = user_activity

    # Calculate review timespan
    user_features['review_timespan'] = (user_features[('date', 'max')] - user_features[('date', 'min')]).dt.days

    # Avoid division by zero for users with only one review
    user_features['review_timespan'] = user_features['review_timespan'].replace(0, 1)

    # Compute average reviews per day
    user_features['avg_reviews_per_day'] = user_features[('prod_id', 'count')] / user_features['review_timespan']

    # Compute percentage of active days against user existence date
    user_features['user_active_percentage'] = user_features['unique_days_active'] / user_features['review_timespan']

    # round floats to 3dp
    user_features = user_features.round(3)

    # flatten df
    user_features.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col for col in user_features.columns]
    user_features.fillna(0)

    # Rename columns
    user_features.rename(columns={
        'prod_id_count': 'user_restaurants_reviewed', 
        'date_min': 'user_earliest', 
        'date_max': 'user_latest',
        'review_timespan': 'user_review_timespan',
        'unique_days_active': 'user_days_active',
        'avg_reviews_per_day': 'users_avg_per_day'
    }, inplace=True)

    user_features['user_earliest'] = user_features['user_earliest'].astype('int64') // 10**9
    user_features['user_latest'] = user_features['user_latest'].astype('int64') // 10**9
    return user_features.reset_index()

# Function for a new user
def UpdateUserDetails(train_features, test_X):
    # Merge on User ID data, if any users already exist in the trainset
    new_X = test_X.merge(train_features, on="user_id", how="left")

    # Ensure 'date' is in datetime format
    new_X['date'] = pd.to_datetime(new_X['date'], errors='coerce')

    # Convert to UNIX timestamp correctly
    new_X['date'] = new_X['date'].apply(lambda x: int(x.timestamp()) if pd.notnull(x) else 0)

    # Define default values if there are users that do not exist in the train dataset
    new_X.fillna({
        "rating_mean": 3.0,
        "rating_min": 3.0,
        "rating_max": 3.0,
        "rating_std": 0.0,
        "user_earliest": new_X['date'],
        "user_latest": new_X['date'],
        "user_days_active": 0,
        "user_review_timespan": 1,
        "users_avg_per_day": 0.0,
        "user_active_percentage": 0.0,
        "user_restaurants_reviewed": 0,
    }, inplace=True)
    int_cols = ["user_earliest", "user_latest", "user_restaurants_reviewed", "user_review_timespan", "user_days_active"]
    new_X[int_cols] = new_X[int_cols].astype(int)
    return new_X

In [32]:
def RestaurantFeatureAggregator(X_data):
    df = X_data.copy()
    # Convert date column to datetime for calculations
    df["date"] = pd.to_datetime(df["date"])

    # Compute restaurant-based features
    restaurant_review_counts = df.groupby("prod_id")["rating"].count().rename("total_reviews_for_restaurant")
    restaurant_avg_rating = df.groupby("prod_id")["rating"].mean().rename("avg_rating_for_restaurant")
    restaurant_rating_std = df.groupby("prod_id")["rating"].std().rename("std_dev_rating_for_restaurant")
    restaurant_median_rating = df.groupby("prod_id")["rating"].median().rename("median_rating_for_restaurant")

    # Compute review frequency per restaurant
    restaurant_review_dates = df.groupby("prod_id")["date"].agg(["min", "max", "count"])

    # Review frequency is calculated as (latest_review_date - earliest_review_date) / total_reviews
    '''High Values (e.g., 30+ days per review) → LOW Activity
    Means the restaurant gets infrequent reviews.
    This is expected for small/local restaurants.
    Not necessarily suspicious unless combined with high rating standard deviation.
    2. Moderate Values (e.g., 3-15 days per review) → NORMAL Activity
    Restaurants typically get a review every few days to a week.
    Popular places should fall in this range.
    3. Low Values (e.g., <1 day per review) → HIGH Activity
    Means the restaurant is getting multiple reviews per day.
    This could be organic (high foot traffic places like chains) or suspicious (fake reviews).
    Suspicious if:
    There is a sudden burst of reviews after inactivity.
    A large percentage of reviews come from new users.
    Many reviews have similar timestamps or wording.'''

    restaurant_review_dates["review_frequency_for_restaurant"] = (restaurant_review_dates["max"] - restaurant_review_dates["min"]).dt.days / restaurant_review_dates["count"].clip(lower=2)
    restaurant_review_dates = restaurant_review_dates["review_frequency_for_restaurant"]

    # Compute unique and repeat reviewers count
    '''All reviewers are unique, no reviewer reviewed the same restaurant twice'''

    # Compute Extreme Rating Index
    '''0 → All reviews are 3-star (perfectly neutral).
    1 → Equal mix of 2-star, 3-star, and 4-star reviews.
    2 → All reviews are either 1-star or 5-star (highly polarized).'''

    df["rating_deviation"] = abs(df["rating"] - 3)  # Distance from neutral (3-star)
    extreme_rating_index = df.groupby("prod_id")["rating_deviation"].mean().rename("extreme_rating_index")


    # Merge computed features
    restaurant_features = df[["prod_id"]].drop_duplicates()
    restaurant_features = restaurant_features.merge(restaurant_review_counts, on="prod_id", how="left")
    restaurant_features = restaurant_features.merge(restaurant_avg_rating, on="prod_id", how="left")
    restaurant_features = restaurant_features.merge(restaurant_rating_std, on="prod_id", how="left")
    restaurant_features = restaurant_features.merge(restaurant_median_rating, on="prod_id", how="left")
    restaurant_features = restaurant_features.merge(restaurant_review_dates, on="prod_id", how="left")
    restaurant_features = restaurant_features.merge(extreme_rating_index, on="prod_id", how="left")

    # Fill NaN values for standard deviation (caused by single reviews) with 0
    restaurant_features["std_dev_rating_for_restaurant"] = restaurant_features["std_dev_rating_for_restaurant"].fillna(0)
    return pd.DataFrame(restaurant_features)

# Function for a new user
def UpdateRestaurantDetails(train_features, test_X):
    # Merge on restaurant data, existing restaurant info from train set will be merged
    new_X = test_X.merge(train_features, on="prod_id", how="left")

    # Define default values if there are restaurants that do not exist in the train dataset
    new_X.fillna({
        "total_reviews_for_restaurant": 0,
        "avg_rating_for_restaurant": train_features["avg_rating_for_restaurant"].median(),
        "std_dev_rating_for_restaurant": train_features["std_dev_rating_for_restaurant"].median(),
        "median_rating_for_restaurant": train_features["median_rating_for_restaurant"].median(),
        "review_frequency_for_restaurant": 0,
        "extreme_rating_index": train_features["extreme_rating_index"].median(),
        # "rating_deviation": train_features["rating_deviation"].median()
    }, inplace=True)
    
    int_cols = ["total_reviews_for_restaurant"]
    new_X[int_cols] = new_X[int_cols].astype(int)
    return new_X

# Apply feature engineering to train set

In [33]:
# Create the features and merge them with original dataset
res_features_train = RestaurantFeatureAggregator(X_train)
user_features_train = UserFeatureAggregator(X_train)
new_X_train = X_train.merge(res_features_train, on='prod_id', how='left').merge(user_features_train, on='user_id', how='left')

# Convert date to seconds
new_X_train['date'] = new_X_train['date'].apply(lambda x: int(pd.to_datetime(x).timestamp()))
new_X_train

Unnamed: 0,user_id,prod_id,rating,date,total_reviews_for_restaurant,avg_rating_for_restaurant,std_dev_rating_for_restaurant,median_rating_for_restaurant,review_frequency_for_restaurant,extreme_rating_index,...,rating_mean,rating_min,rating_max,rating_std,user_earliest,user_latest,user_days_active,user_review_timespan,users_avg_per_day,user_active_percentage
0,43559,436,1.0,1299456000,15,3.933333,1.437591,4.0,133.933333,1.600000,...,1.0,1.0,1.0,0.0,1299456000,1299456000,1,1,1.0,1.0
1,27337,188,5.0,1290902400,6,2.333333,1.751190,1.5,247.833333,1.666667,...,5.0,5.0,5.0,0.0,1290902400,1290902400,1,1,1.0,1.0
2,43058,426,5.0,1235174400,31,4.193548,1.166743,5.0,61.322581,1.516129,...,5.0,5.0,5.0,0.0,1235174400,1235174400,1,1,1.0,1.0
3,28287,200,5.0,1285632000,19,4.368421,1.256562,5.0,78.526316,1.789474,...,5.0,5.0,5.0,0.0,1285632000,1285632000,1,1,1.0,1.0
4,33253,280,1.0,1361577600,53,3.339623,1.372020,4.0,49.547170,1.207547,...,1.0,1.0,1.0,0.0,1361577600,1361577600,1,1,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11195,37049,349,4.0,1409788800,27,4.222222,1.339728,5.0,82.185185,1.740741,...,4.0,4.0,4.0,0.0,1409788800,1409788800,1,1,1.0,1.0
11196,11307,2694,5.0,1213660800,44,4.250000,0.838742,4.0,54.477273,1.386364,...,5.0,5.0,5.0,0.0,1213660800,1213660800,1,1,1.0,1.0
11197,6520,9,3.0,1405123200,1740,2.505172,1.318893,2.0,2.020690,1.187931,...,3.0,3.0,3.0,0.0,1405123200,1405123200,1,1,1.0,1.0
11198,48218,496,4.0,1366934400,210,4.057143,1.184659,4.0,12.285714,1.447619,...,4.0,4.0,4.0,0.0,1366934400,1366934400,1,1,1.0,1.0


# Apply feature engineering to test set

In [34]:
new_X_test = UpdateRestaurantDetails(res_features_train, X_test)
new_X_test = UpdateUserDetails(user_features_train, new_X_test)
new_X_test

Unnamed: 0,user_id,prod_id,rating,date,total_reviews_for_restaurant,avg_rating_for_restaurant,std_dev_rating_for_restaurant,median_rating_for_restaurant,review_frequency_for_restaurant,extreme_rating_index,...,rating_mean,rating_min,rating_max,rating_std,user_earliest,user_latest,user_days_active,user_review_timespan,users_avg_per_day,user_active_percentage
0,9136,28,1.0,1313193600,551,4.123412,1.021236,4.0,6.399274,1.348457,...,3.0,3.0,3.0,0.0,1313193600,1313193600,0,1,0.0,0.0
1,10546,43,4.0,1386633600,148,3.898649,1.080060,4.0,19.148649,1.222973,...,3.0,3.0,3.0,0.0,1386633600,1386633600,0,1,0.0,0.0
2,11998,57,5.0,1278892800,688,3.614826,1.018696,4.0,4.222384,0.981105,...,3.0,3.0,3.0,0.0,1278892800,1278892800,0,1,0.0,0.0
3,5428,7,5.0,1393718400,350,4.211429,0.930776,4.0,4.237143,1.377143,...,3.0,3.0,3.0,0.0,1393718400,1393718400,0,1,0.0,0.0
4,5954,9,1.0,1286150400,1740,2.505172,1.318893,2.0,2.020690,1.187931,...,3.0,3.0,3.0,0.0,1286150400,1286150400,0,1,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,31000,4341,5.0,1382832000,11,2.545455,1.752920,2.0,93.545455,1.545455,...,3.0,3.0,3.0,0.0,1382832000,1382832000,0,1,0.0,0.0
4796,48244,496,4.0,1305676800,210,4.057143,1.184659,4.0,12.285714,1.447619,...,3.0,3.0,3.0,0.0,1305676800,1305676800,0,1,0.0,0.0
4797,39850,375,3.0,1256601600,54,4.092593,1.431038,5.0,55.259259,1.722222,...,3.0,3.0,3.0,0.0,1256601600,1256601600,0,1,0.0,0.0
4798,38039,358,5.0,1341532800,7,5.000000,0.000000,5.0,110.714286,2.000000,...,3.0,3.0,3.0,0.0,1341532800,1341532800,0,1,0.0,0.0


In [35]:
print(new_X_train.shape, new_X_test.shape)

(11200, 21) (4800, 21)


In [36]:
new_X_train.dtypes

user_id                              int64
prod_id                              int64
rating                             float64
date                                 int64
total_reviews_for_restaurant         int64
avg_rating_for_restaurant          float64
std_dev_rating_for_restaurant      float64
median_rating_for_restaurant       float64
review_frequency_for_restaurant    float64
extreme_rating_index               float64
user_restaurants_reviewed            int64
rating_mean                        float64
rating_min                         float64
rating_max                         float64
rating_std                         float64
user_earliest                        int64
user_latest                          int64
user_days_active                     int64
user_review_timespan                 int64
users_avg_per_day                  float64
user_active_percentage             float64
dtype: object

In [37]:
new_X_test.dtypes

user_id                              int64
prod_id                              int64
rating                             float64
date                                 int64
total_reviews_for_restaurant         int32
avg_rating_for_restaurant          float64
std_dev_rating_for_restaurant      float64
median_rating_for_restaurant       float64
review_frequency_for_restaurant    float64
extreme_rating_index               float64
user_restaurants_reviewed            int32
rating_mean                        float64
rating_min                         float64
rating_max                         float64
rating_std                         float64
user_earliest                        int32
user_latest                          int32
user_days_active                     int32
user_review_timespan                 int32
users_avg_per_day                  float64
user_active_percentage             float64
dtype: object

In [38]:
new_X_train.isnull().sum()

user_id                            0
prod_id                            0
rating                             0
date                               0
total_reviews_for_restaurant       0
avg_rating_for_restaurant          0
std_dev_rating_for_restaurant      0
median_rating_for_restaurant       0
review_frequency_for_restaurant    0
extreme_rating_index               0
user_restaurants_reviewed          0
rating_mean                        0
rating_min                         0
rating_max                         0
rating_std                         0
user_earliest                      0
user_latest                        0
user_days_active                   0
user_review_timespan               0
users_avg_per_day                  0
user_active_percentage             0
dtype: int64

In [39]:
new_X_test.isnull().sum()

user_id                            0
prod_id                            0
rating                             0
date                               0
total_reviews_for_restaurant       0
avg_rating_for_restaurant          0
std_dev_rating_for_restaurant      0
median_rating_for_restaurant       0
review_frequency_for_restaurant    0
extreme_rating_index               0
user_restaurants_reviewed          0
rating_mean                        0
rating_min                         0
rating_max                         0
rating_std                         0
user_earliest                      0
user_latest                        0
user_days_active                   0
user_review_timespan               0
users_avg_per_day                  0
user_active_percentage             0
dtype: int64

# Feature Scaling (StandardScaler)

In [40]:
#TODO

# PCA

In [41]:
#TODO

# Train model
temporary model for now

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train logistic regression model
clf = LogisticRegression()
clf.fit(new_X_train, y_train)

# Make predictions
y_pred = clf.predict(new_X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8875


In [None]:
# checking versions (sync with kaggle if using)
import sys
import numpy as np
import pandas as pd
import sklearn

print("Python Version:", sys.version)
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Scikit-Learn Version:", sklearn.__version__)

Python Version: 3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]
NumPy Version: 1.26.3
Pandas Version: 2.2.1
Scikit-Learn Version: 1.5.0
