In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, precision_recall_curve, auc

# Import Dataset
Import and concat metadata and reviews. Ratings without reviews were dropped to align with content modelling.

In [20]:
df = pd.read_csv('../00_dataset/YelpZip/metadata', 
                 sep='\t',
                 header=None,
                 names=["user_id", "prod_id", "rating", "label", "date"])
reviews_df = pd.read_csv('../00_dataset/YelpZip/reviewContent',
                sep='\t',
                header=None,
                names=['user_id', 'prod_id', 'date', 'review'])

df = df.merge(reviews_df,
              left_on=['user_id', 'prod_id', 'date'],
              right_on=['user_id', 'prod_id', 'date'],
              how='left')
df = df.dropna(subset=['review'])
df

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,5044,0,1.0,-1,2014-11-16,"Drinks were bad, the hot chocolate was watered..."
1,5045,0,1.0,-1,2014-09-08,This was the worst experience I've ever had a ...
2,5046,0,3.0,-1,2013-10-06,This is located on the site of the old Spruce ...
3,5047,0,5.0,-1,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...
4,5048,0,5.0,-1,2014-08-28,I love Toast! The food choices are fantastic -...
...,...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20,When I first moved to the area I must say I wa...
608594,56277,5039,2.0,1,2012-11-12,Kind of pricey. I guess I expected a ridiculou...
608595,265320,5039,1.0,1,2012-08-22,"Stopped by this restaurant yesterday, we just ..."
608596,161722,5039,4.0,1,2011-05-11,Finally checked out The Best Subs in Claremont...


# Dataset Pre-processing
Minimal cleanup because feature engineering will be using and converting date values.

In [21]:
# Check for Null values
df.isnull().sum()

user_id    0
prod_id    0
rating     0
label      0
date       0
review     0
dtype: int64

In [22]:
df['label'] = df['label'].replace({1: 0, -1: 1})
y = df['label']
X = df.drop('label', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42, stratify=y_train)

In [23]:
print(f"""
X_train: {X_train.shape}
X_validation: {X_val.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_validation: {y_val.shape}
y_test: {y_test.shape}
""")


X_train: (425920, 5)
X_validation: (60846, 5)
X_test: (121692, 5)
y_train: (425920,)
y_validation: (60846,)
y_test: (121692,)



# Define Models

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier  # Boosting
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

for category, model in models.items():
    print(f"{category}: {model.__class__.__name__}")

LR: LogisticRegression
NN: MLPClassifier
KNN: KNeighborsClassifier
DT: DecisionTreeClassifier
RF: RandomForestClassifier
AB: AdaBoostClassifier
XGB: XGBClassifier
NB: GaussianNB


# Create pipeline

In [25]:
results = []

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

prob_df = pd.DataFrame()
test_on = ["FSS", "FMS", "SS"]

for test in test_on:

    for name, model in models.items():
        print(f"=== Model: {name} ===")
        if test == "FSS":
            pipeline = Pipeline([
                ('feature_engineering', CombinedEngineer()),  # Feature engineering
                ('scaler', StandardScaler()),  # scaling
                ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                ('classifier', model)  # Classifier
            ])
        elif test == "SS":
            X_train['date'] = pd.to_datetime(X_train['date']).astype('int64') // 10**9
            X_val['date'] = pd.to_datetime(X_val['date']).astype('int64') // 10**9
            X_train = X_train.drop(columns=["user_id", "prod_id", "review"], errors='ignore')
            X_val = X_val.drop(columns=["user_id", "prod_id", "review"], errors='ignore')

            pipeline = Pipeline([
                # ('feature_engineering', CombinedEngineer()),  # Feature engineering
                ('scaler', StandardScaler()),  # scaling
                ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                ('classifier', model)  # Classifier
            ])
        else:
            # FMS
            pipeline = Pipeline([
                ('feature_engineering', CombinedEngineer()),  # Feature engineering
                ('scaler', MinMaxScaler()),  # scaling
                ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                ('classifier', model)  # Classifier
            ])

        # Fit and evaluate the model
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)

        # Evaluation metrics
        accuracy = accuracy_score(y_val, y_pred)
        precision = precision_score(y_val, y_pred, pos_label=1)
        recall = recall_score(y_val, y_pred, pos_label=1)
        f1 = f1_score(y_val, y_pred, average='binary')
        print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

        # Balanced accuracy
        balanced_acc = balanced_accuracy_score(y_val, y_pred)
        print(f"Balanced Accuracy: {balanced_acc:.3f}")

        # MCC
        mcc = matthews_corrcoef(y_val, y_pred)
        
        # PR-AUC
        y_pred_proba = pipeline.predict_proba(X_val)


        # ROC-AUC for both classes (Class 0 and Class 1)
        roc_auc_0 = roc_auc_score(y_val, y_pred_proba[:, 0])  # ROC-AUC for Class 0 (majority class)
        roc_auc_1 = roc_auc_score(y_val, y_pred_proba[:, 1])  # ROC-AUC for Class 1 (minority class)

        # PR-AUC for both classes
        precision_0, recall_0, _ = precision_recall_curve(y_val, y_pred_proba[:, 0])  # For Class 0
        pr_auc_0 = auc(recall_0, precision_0)

        precision_1, recall_1, _ = precision_recall_curve(y_val, y_pred_proba[:, 1])  # For Class 1
        pr_auc_1 = auc(recall_1, precision_1)

        # Convert to DataFrame
        df_probs = pd.DataFrame(y_pred_proba, columns=[f"{name}_prob_{i}" for i in range(y_pred_proba.shape[1])])

        # Add to the main DataFrame
        prob_df = pd.concat([prob_df, df_probs], axis=1)

        results.append({
            "Model": f"{name}_{test}",
            "Accuracy": f"{accuracy:.4f}",
            "Precision": f"{precision:.4f}",
            "Recall": f"{recall:.4f}",
            "F1 Score": f"{f1:.4f}",
            "Balanced Accuracy": f"{balanced_acc:.4f}",
            "MCC": f"{mcc:.4f}",
            "PR-AUC_0": f"{pr_auc_0:.4f}",
            "PR-AUC_1": f"{pr_auc_1:.4f}",
            "ROC-AUC_0": f"{roc_auc_0:.4f}",
            "ROC-AUC_1": f"{roc_auc_1:.4f}"
        })

prob_df["Actual"] = y_val.values
prob_df.to_csv("b2_model_probabilities.csv")

=== Model: LR ===
Accuracy: 0.583, Precision: 0.228, Recall: 0.906, F1: 0.365
Balanced Accuracy: 0.720
=== Model: NN ===




Accuracy: 0.779, Precision: 0.274, Recall: 0.409, F1: 0.328
Balanced Accuracy: 0.622
=== Model: KNN ===
Accuracy: 0.715, Precision: 0.249, Recall: 0.574, F1: 0.348
Balanced Accuracy: 0.655
=== Model: DT ===
Accuracy: 0.784, Precision: 0.290, Recall: 0.439, F1: 0.349
Balanced Accuracy: 0.637
=== Model: RF ===
Accuracy: 0.810, Precision: 0.346, Recall: 0.491, F1: 0.406
Balanced Accuracy: 0.675
=== Model: AB ===
Accuracy: 0.582, Precision: 0.229, Recall: 0.910, F1: 0.366
Balanced Accuracy: 0.721
=== Model: XGB ===
Accuracy: 0.656, Precision: 0.257, Recall: 0.846, F1: 0.394
Balanced Accuracy: 0.737
=== Model: NB ===
Accuracy: 0.545, Precision: 0.216, Recall: 0.932, F1: 0.351
Balanced Accuracy: 0.709
=== Model: LR ===
Accuracy: 0.583, Precision: 0.228, Recall: 0.906, F1: 0.365
Balanced Accuracy: 0.720
=== Model: NN ===




Accuracy: 0.617, Precision: 0.240, Recall: 0.875, F1: 0.377
Balanced Accuracy: 0.726
=== Model: KNN ===
Accuracy: 0.716, Precision: 0.248, Recall: 0.566, F1: 0.345
Balanced Accuracy: 0.653
=== Model: DT ===
Accuracy: 0.787, Precision: 0.291, Recall: 0.424, F1: 0.345
Balanced Accuracy: 0.633
=== Model: RF ===
Accuracy: 0.820, Precision: 0.356, Recall: 0.452, F1: 0.398
Balanced Accuracy: 0.664
=== Model: AB ===
Accuracy: 0.581, Precision: 0.228, Recall: 0.909, F1: 0.365
Balanced Accuracy: 0.720
=== Model: XGB ===
Accuracy: 0.700, Precision: 0.272, Recall: 0.761, F1: 0.401
Balanced Accuracy: 0.726
=== Model: NB ===
Accuracy: 0.542, Precision: 0.216, Recall: 0.935, F1: 0.350
Balanced Accuracy: 0.708
=== Model: LR ===
Accuracy: 0.561, Precision: 0.130, Recall: 0.406, F1: 0.196
Balanced Accuracy: 0.495
=== Model: NN ===
Accuracy: 0.537, Precision: 0.174, Recall: 0.669, F1: 0.277
Balanced Accuracy: 0.593
=== Model: KNN ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.868, Precision: 0.000, Recall: 0.000, F1: 0.000
Balanced Accuracy: 0.500
=== Model: DT ===
Accuracy: 0.537, Precision: 0.174, Recall: 0.669, F1: 0.277
Balanced Accuracy: 0.593
=== Model: RF ===
Accuracy: 0.537, Precision: 0.174, Recall: 0.669, F1: 0.277
Balanced Accuracy: 0.593
=== Model: AB ===
Accuracy: 0.537, Precision: 0.174, Recall: 0.669, F1: 0.277
Balanced Accuracy: 0.593
=== Model: XGB ===
Accuracy: 0.537, Precision: 0.174, Recall: 0.669, F1: 0.277
Balanced Accuracy: 0.593
=== Model: NB ===
Accuracy: 0.797, Precision: 0.224, Recall: 0.218, F1: 0.221
Balanced Accuracy: 0.551


## Results

In [28]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by "Model" alphabetically
results_df = results_df.sort_values(by="Model")

# Display the results table
print("\n=== Model Evaluation Results ===")
print(results_df)


=== Model Evaluation Results ===
      Model Accuracy Precision  Recall F1 Score Balanced Accuracy      MCC  \
13   AB_FMS   0.5813    0.2281  0.9091   0.3647            0.7202   0.2988   
5    AB_FSS   0.5823    0.2287  0.9105   0.3656            0.7214   0.3004   
21    AB_SS   0.5372    0.1743  0.6694   0.2766            0.5932   0.1263   
11   DT_FMS   0.7870    0.2907  0.4243   0.3450            0.6333   0.2288   
3    DT_FSS   0.7837    0.2898  0.4386   0.3490            0.6374   0.2327   
19    DT_SS   0.5372    0.1743  0.6694   0.2766            0.5932   0.1263   
10  KNN_FMS   0.7163    0.2485  0.5660   0.3454            0.6526   0.2254   
2   KNN_FSS   0.7153    0.2494  0.5740   0.3477            0.6554   0.2288   
18   KNN_SS   0.8678    0.0000  0.0000   0.0000            0.5000   0.0000   
8    LR_FMS   0.5830    0.2284  0.9055   0.3647            0.7197   0.2980   
0    LR_FSS   0.5830    0.2285  0.9063   0.3649            0.7200   0.2984   
16    LR_SS   0.5612    0.1296

In [29]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Balanced Accuracy,MCC,PR-AUC_0,PR-AUC_1,ROC-AUC_0,ROC-AUC_1
13,AB_FMS,0.5813,0.2281,0.9091,0.3647,0.7202,0.2988,0.0802,0.3044,0.2239,0.7761
5,AB_FSS,0.5823,0.2287,0.9105,0.3656,0.7214,0.3004,0.0803,0.2994,0.2224,0.7776
21,AB_SS,0.5372,0.1743,0.6694,0.2766,0.5932,0.1263,0.1347,0.2735,0.3804,0.6196
11,DT_FMS,0.787,0.2907,0.4243,0.345,0.6333,0.2288,0.361,0.3953,0.3656,0.6344
3,DT_FSS,0.7837,0.2898,0.4386,0.349,0.6374,0.2327,0.3539,0.4011,0.3613,0.6387
19,DT_SS,0.5372,0.1743,0.6694,0.2766,0.5932,0.1263,0.1347,0.2735,0.3804,0.6196
10,KNN_FMS,0.7163,0.2485,0.566,0.3454,0.6526,0.2254,0.1774,0.3152,0.2887,0.7113
2,KNN_FSS,0.7153,0.2494,0.574,0.3477,0.6554,0.2288,0.1825,0.3201,0.2909,0.7091
18,KNN_SS,0.8678,0.0,0.0,0.0,0.5,0.0,0.4646,0.255,0.4454,0.5546
8,LR_FMS,0.583,0.2284,0.9055,0.3647,0.7197,0.298,0.0801,0.3074,0.2205,0.7795


# K-Fold Cross Validation

In [None]:
# Split into train & test only, as cross-validation would automatically split validation within the train
X_train_kf, X_test, y_train_kf, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    balanced_accuracy_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc,
    make_scorer
)
import numpy as np

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
prob_df = pd.DataFrame()
results = []

for name, model in models.items():
    print(f"=== Model: {name} ===")

    # Define pipeline
    pipeline = Pipeline([
        ('feature_engineering', CombinedEngineer()),  # Feature engineering
        ('scaler', StandardScaler()),  # Scaling
        ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
        ('classifier', model)  # Classifier
    ])

    # Define scoring metrics
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X_train_kf, y_train_kf, cv=cv, scoring=scoring, return_train_score=False)

    # Get predicted probabilities for PR-AUC & ROC-AUC
    y_pred_proba = cross_val_predict(pipeline, X_train_kf, y_train_kf, cv=cv, method="predict_proba")

    # Compute additional metrics
    roc_auc_0 = roc_auc_score(y_train_kf, y_pred_proba[:, 0])
    roc_auc_1 = roc_auc_score(y_train_kf, y_pred_proba[:, 1])

    precision_0, recall_0, _ = precision_recall_curve(y_train_kf, y_pred_proba[:, 0])
    pr_auc_0 = auc(recall_0, precision_0)

    precision_1, recall_1, _ = precision_recall_curve(y_train_kf, y_pred_proba[:, 1])
    pr_auc_1 = auc(recall_1, precision_1)

    print(f"Accuracy: {np.mean(cv_results['test_accuracy']):.3f}, Precision: {np.mean(cv_results['test_precision']):.3f}, Recall: {np.mean(cv_results['test_recall']):.3f}, F1: {np.mean(cv_results['test_f1']):.3f}")
    print(f"Balanced Accuracy: {np.mean(cv_results['test_balanced_accuracy']):.3f}")
    # Store metrics
    results.append({
        "Model": name,
        "Accuracy": f"{np.mean(cv_results['test_accuracy']):.4f} ± {np.std(cv_results['test_accuracy']):.4f}",
        "Precision": f"{np.mean(cv_results['test_precision']):.4f} ± {np.std(cv_results['test_precision']):.4f}",
        "Recall": f"{np.mean(cv_results['test_recall']):.4f} ± {np.std(cv_results['test_recall']):.4f}",
        "F1 Score": f"{np.mean(cv_results['test_f1']):.4f} ± {np.std(cv_results['test_f1']):.4f}",
        "Balanced Accuracy": f"{np.mean(cv_results['test_balanced_accuracy']):.4f} ± {np.std(cv_results['test_balanced_accuracy']):.4f}",
        "MCC": f"{np.mean(cv_results['test_mcc']):.4f} ± {np.std(cv_results['test_mcc']):.4f}",
        "PR-AUC_0": f"{pr_auc_0:.4f}",
        "PR-AUC_1": f"{pr_auc_1:.4f}",
        "ROC-AUC_0": f"{roc_auc_0:.4f}",
        "ROC-AUC_1": f"{roc_auc_1:.4f}"
    })

    df_probs = pd.DataFrame(y_pred_proba, columns=[f"{name}_prob_{i}" for i in range(y_pred_proba.shape[1])])
    prob_df = pd.concat([prob_df, df_probs], axis=1)

# Save results
df_results = pd.DataFrame(results)
df_results.to_csv("b2_cross_validation_results_kfold.csv", index=False)

=== Model: LR ===
Accuracy: 0.571, Precision: 0.224, Recall: 0.911, F1: 0.360
Balanced Accuracy: 0.715
=== Model: NN ===




Accuracy: 0.765, Precision: 0.233, Recall: 0.345, F1: 0.275
Balanced Accuracy: 0.587
=== Model: KNN ===
Accuracy: 0.711, Precision: 0.243, Recall: 0.563, F1: 0.340
Balanced Accuracy: 0.648
=== Model: DT ===
Accuracy: 0.783, Precision: 0.286, Recall: 0.428, F1: 0.343
Balanced Accuracy: 0.633
=== Model: RF ===
Accuracy: 0.809, Precision: 0.341, Recall: 0.479, F1: 0.399
Balanced Accuracy: 0.669
=== Model: AB ===
Accuracy: 0.570, Precision: 0.224, Recall: 0.915, F1: 0.360
Balanced Accuracy: 0.716
=== Model: XGB ===
Accuracy: 0.634, Precision: 0.249, Recall: 0.874, F1: 0.387
Balanced Accuracy: 0.736
=== Model: NB ===
Accuracy: 0.538, Precision: 0.214, Recall: 0.935, F1: 0.348
Balanced Accuracy: 0.706


#### Clean up results and export to csv

In [31]:
# Remove the ± symbol and the standard deviation
df_results_cleaned = df_results.copy()
for col in ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Balanced Accuracy', 'MCC']:
    df_results_cleaned[col] = df_results_cleaned[col].str.split(' ± ').str[0]

df_results_cleaned.to_csv("b2_cross_validation_results_clean.csv")

In [32]:
# checking versions (sync with kaggle if using)
import sys
import numpy as np
import pandas as pd
import sklearn

print("Python Version:", sys.version)
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Scikit-Learn Version:", sklearn.__version__)

Python Version: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
NumPy Version: 2.2.3
Pandas Version: 2.2.3
Scikit-Learn Version: 1.6.1
