In [None]:
import pandas as pd

# Rank every candidate model in the combined results table with a weighted score.
file_path = "b4_combined_results.csv"
results_df = pd.read_csv(file_path)

# Selection-formula weights (they add up to 1.0); precision and recall dominate.
alpha = 0.5     # Precision
beta = 0.3      # Recall
gamma = 0.05    # Balanced accuracy
delta = 0.05    # MCC
epsilon = 0.05  # F1-score
zeta = 0.05     # PR-AUC_0

# Metric column -> weight, listed in the order in which the terms are summed.
metric_weights = {
    "Precision": alpha,
    "Recall": beta,
    "Balanced Accuracy": gamma,
    "MCC": delta,
    "F1 Score": epsilon,
    "PR-AUC_0": zeta,
}
score_columns = list(metric_weights)
report_columns = ["Model", *score_columns, "Score"]

# Force the metric columns to float before doing arithmetic on them.
results_df = results_df.astype({column: float for column in score_columns})

# Accumulate the weighted terms left to right, one metric at a time.
weighted_terms = [weight * results_df[column] for column, weight in metric_weights.items()]
score = weighted_terms[0]
for term in weighted_terms[1:]:
    score = score + term
results_df["Score"] = score

# Row with the highest score.
best_index = results_df["Score"].idxmax()
best_model = results_df.loc[best_index]

print("Best Model for Fake Review Detection (Using Enhanced Scoring Formula):")
print(best_model[report_columns])

# Five highest-scoring rows, best first.
top_5_models = results_df.sort_values(by="Score", ascending=False).head(5)

print(top_5_models[report_columns])


Best Model for Fake Review Detection (Using Enhanced Scoring Formula):
Model                RF_MI_FMS
Precision               0.2318
Recall                  0.9614
Balanced Accuracy        0.738
MCC                      0.324
F1 Score                0.3735
PR-AUC_0                0.0838
Score                 0.480285
Name: 34, dtype: object
               Model  Precision  Recall  Balanced Accuracy     MCC  F1 Score  \
34         RF_MI_FMS     0.2318  0.9614             0.7380  0.3240    0.3735   
26         RF_MI_FSS     0.2318  0.9611             0.7379  0.3239    0.3735   
50   RF_Lasso_MI_FMS     0.2320  0.9603             0.7380  0.3238    0.3737   
42   RF_Lasso_MI_FSS     0.2323  0.9401             0.7334  0.3173    0.3725   
44  XGB_Lasso_MI_FSS     0.2264  0.9358             0.7243  0.3052    0.3646   

    PR-AUC_0     Score  
34    0.0838  0.480285  
26    0.0839  0.480190  
50    0.0841  0.480070  
42    0.0836  0.473520  
44    0.0801  0.467650  


In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, precision_recall_curve, auc

In [60]:
# Columns that identify one review in both YelpZip files.
review_keys = ['user_id', 'prod_id', 'date']

# Review metadata: tab-separated, no header row, so column names are supplied here.
df = pd.read_csv(
    '../00_dataset/YelpZip/metadata',
    sep='\t',
    header=None,
    names=["user_id", "prod_id", "rating", "label", "date"],
)

# Review text, keyed by the same three columns.
reviews_df = pd.read_csv(
    '../00_dataset/YelpZip/reviewContent',
    sep='\t',
    header=None,
    names=['user_id', 'prod_id', 'date', 'review'],
)

# Attach the text to each metadata row, then discard rows that got no text.
# NOTE(review): worth confirming why some rows have no matching review
# (e.g. quote handling while parsing the tab-separated text file).
df = (
    df.merge(reviews_df, on=review_keys, how='left')
      .dropna(subset=['review'])
)
df

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,5044,0,1.0,-1,2014-11-16,"Drinks were bad, the hot chocolate was watered..."
1,5045,0,1.0,-1,2014-09-08,This was the worst experience I've ever had a ...
2,5046,0,3.0,-1,2013-10-06,This is located on the site of the old Spruce ...
3,5047,0,5.0,-1,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...
4,5048,0,5.0,-1,2014-08-28,I love Toast! The food choices are fantastic -...
...,...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20,When I first moved to the area I must say I wa...
608594,56277,5039,2.0,1,2012-11-12,Kind of pricey. I guess I expected a ridiculou...
608595,265320,5039,1.0,1,2012-08-22,"Stopped by this restaurant yesterday, we just ..."
608596,161722,5039,4.0,1,2011-05-11,Finally checked out The Best Subs in Claremont...


In [61]:
# Build the binary target WITHOUT mutating `df`.
# The raw file encodes the classes as 1 and -1; they are remapped to 0 and 1
# respectively (presumably 1 = fake review, the class being detected - confirm).
# Writing the remapped values back into df['label'] made this cell non-idempotent:
# a second run would map the new 1s to 0 and silently wipe out the positive class.
label_map = {1: 0, -1: 1}
unexpected_labels = set(df['label'].unique()) - set(label_map)
if unexpected_labels:
    # Fail loudly instead of passing unknown (or already remapped) labels through.
    raise ValueError(f"Unexpected raw label values: {sorted(unexpected_labels)}")

y = df['label'].replace(label_map)
X = df.drop('label', axis=1)

# Step 1: hold out 20% of the rows as the test set (stratified on the target).
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Step 2: take 12.5% of the remaining 80% as validation -> 70/10/20 overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.125, random_state=42, stratify=y_trainval
)

In [62]:
# Report the shape of every split, one per line, framed by blank lines.
split_shapes = {
    "X_train": X_train.shape,
    "X_validation": X_val.shape,
    "X_test": X_test.shape,
    "y_train": y_train.shape,
    "y_validation": y_val.shape,
    "y_test": y_test.shape,
}
shape_report = "\n" + "".join(f"{label}: {shape}\n" for label, shape in split_shapes.items())
print(shape_report)


X_train: (425920, 5)
X_validation: (60846, 5)
X_test: (121692, 5)
y_train: (425920,)
y_validation: (60846,)
y_test: (121692,)



In [63]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# The scoring cell ranks RF_MI_FMS (a random-forest variant) first and the saved
# probabilities file is tagged RF_MI_FMS, so the classifier must be a random forest;
# the previous XGBClassifier() produced a mislabelled output file.
# random_state matches the seed used for the splits and SMOTE so runs are repeatable.
# NOTE(review): hyperparameters are left at their defaults - confirm against the
# configuration that produced the RF_MI_FMS row in the results table.
chosen_model = RandomForestClassifier(random_state=42)

In [64]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

# Column names handed to CombinedEngineer via `drop_columns`
# (presumably removed from the engineered feature set - confirm in FeatureEngineer).
dropped_features = [
    'user_earliest',
    'extreme_rating_index',
    'total_reviews_for_restaurant',
    'avg_rating_for_restaurant',
    'review_frequency_for_restaurant',
    'user_latest',
    'std_dev_rating_for_restaurant',
    'rating_min',
    'date',
]

# Steps run in this order: feature engineering -> scaling -> SMOTE -> classifier.
pipeline_steps = [
    ('feature_engineering', CombinedEngineer(drop_columns=dropped_features)),
    ('scaler', MinMaxScaler()),
    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),
    ('classifier', chosen_model),
]
pipeline = Pipeline(pipeline_steps)

In [65]:
# Fit model: train every pipeline step (feature engineering, scaler, SMOTE,
# classifier) on the training split only; the validation and test splits are
# not passed in here.
pipeline.fit(X_train, y_train)

In [66]:
# Evaluate model: run the fitted pipeline on the held-out test split.
# No metric is computed in this cell; it only collects the predictions.
# NOTE(review): X_val / y_val are not used in the cells shown here.
y_pred = pipeline.predict(X_test)  # predicted class label per test row
y_pred_proba = pipeline.predict_proba(X_test)  # 2-D: one column per class (see shape[1] use below)

In [67]:
# One probability column per class, named after the class position.
n_classes = y_pred_proba.shape[1]
probability_columns = [f"Prob (Class {i})" for i in range(n_classes)]

# Probabilities first, then the true label and the predicted label for each row.
# `.values` strips y_test's index so rows line up by position, not by index label.
df_probs = pd.DataFrame(y_pred_proba, columns=probability_columns).assign(
    **{
        "Actual Label": y_test.values,
        "Predicted Label": y_pred,
    }
)

In [68]:
# Write df_probs (class probabilities, actual and predicted labels) to a CSV in
# the working directory; the DataFrame index is written as the first column.
# The file name carries the RF_MI_FMS model tag - keep it in sync with
# `chosen_model` whenever a different classifier is plugged into the pipeline.
df_probs.to_csv("b5_model_probabilities_RF_MI_FMS.csv")