### Import Libraries and Functions

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, precision_recall_curve, auc

### Load Datasets

In [7]:
df = pd.read_csv('../00_dataset/YelpZip/metadata', 
                 sep='\t',
                 header=None,
                 names=["user_id", "prod_id", "rating", "label", "date"])
reviews_df = pd.read_csv('../00_dataset/YelpZip/reviewContent',
                sep='\t',
                header=None,
                names=['user_id', 'prod_id', 'date', 'review'])

df = df.merge(reviews_df,
              left_on=['user_id', 'prod_id', 'date'],
              right_on=['user_id', 'prod_id', 'date'],
              how='left')
df = df.dropna(subset=['review'])
df

Unnamed: 0,user_id,prod_id,rating,label,date,review
0,5044,0,1.0,-1,2014-11-16,"Drinks were bad, the hot chocolate was watered..."
1,5045,0,1.0,-1,2014-09-08,This was the worst experience I've ever had a ...
2,5046,0,3.0,-1,2013-10-06,This is located on the site of the old Spruce ...
3,5047,0,5.0,-1,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...
4,5048,0,5.0,-1,2014-08-28,I love Toast! The food choices are fantastic -...
...,...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20,When I first moved to the area I must say I wa...
608594,56277,5039,2.0,1,2012-11-12,Kind of pricey. I guess I expected a ridiculou...
608595,265320,5039,1.0,1,2012-08-22,"Stopped by this restaurant yesterday, we just ..."
608596,161722,5039,4.0,1,2011-05-11,Finally checked out The Best Subs in Claremont...


In [8]:
# Check for Null values
df.isnull().sum()

user_id    0
prod_id    0
rating     0
label      0
date       0
review     0
dtype: int64

In [None]:
df['label'] = df['label'].replace({1: 0, -1: 1})
y = df['label']
X = df.drop('label', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125, random_state=42, stratify=y_train)

### Check Shape

In [None]:
print(f"""
X_train: {X_train.shape}
X_validation: {X_val.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_validation: {y_val.shape}
y_test: {y_test.shape}
""")


X_train: (425920, 5)
X_validation: (60846, 5)
X_test: (121692, 5)
y_train: (425920,)
y_validation: (60846,)
y_test: (60846,)



### Define Models

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier  # Boosting
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Example usage: print model names
for category, model in models.items():
    print(f"{category}: {model.__class__.__name__}")

LR: LogisticRegression
NN: MLPClassifier
KNN: KNeighborsClassifier
DT: DecisionTreeClassifier
RF: RandomForestClassifier
AB: AdaBoostClassifier
XGB: XGBClassifier
NB: GaussianNB


In [12]:
results = []

### Print what the pipeline does

In [18]:
# Define drop_columns_dict for feature selection methods
drop_columns_dict = {
    "baseline": set(),
    "MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
           'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date'},
    "Lasso_MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
                 'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date', 
                 'user_restaurants_reviewed'},
    "RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
            'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min'},
    "Lasso_RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
                  'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min', 'user_earliest'},
    "RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating'},
    "Lasso_RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating', 'user_earliest', 'user_restaurants_reviewed'}
}

# Define pipelines
pipelines = ["FSS", "FMS", "SS"]

# Print out what will be fed into each pipeline
for p in pipelines:
    print(f"\n=== Pipeline: {p} ===")

    if p == "SS":
        print("- Uses the dataset **without feature engineering**")
        print("- Columns dropped: ['user_id', 'prod_id', 'review']")
        print("- `date` is converted to UNIX timestamp")
    else:
        print("- Uses **feature-engineered dataset**")
        print("- Scaling: StandardScaler (FSS) or MinMaxScaler (FMS)")
        print("- Feature selection applied for each method below:")

        for feature_selection_method, drop_columns in drop_columns_dict.items():
            print(f"  - Feature Selection: {feature_selection_method}")
            print(f"    - Columns dropped: {sorted(drop_columns) if drop_columns else 'None'}")
    
print("\n=== Baseline Model ===")
print("- Uses **all features**, no feature selection applied")
print("- Uses feature-engineered dataset")
print("- Scaling: StandardScaler / MinMaxScaler")



=== Pipeline: FSS ===
- Uses **feature-engineered dataset**
- Scaling: StandardScaler (FSS) or MinMaxScaler (FMS)
- Feature selection applied for each method below:
  - Feature Selection: baseline
    - Columns dropped: None
  - Feature Selection: MI
    - Columns dropped: ['avg_rating_for_restaurant', 'date', 'extreme_rating_index', 'rating_min', 'review_frequency_for_restaurant', 'std_dev_rating_for_restaurant', 'total_reviews_for_restaurant', 'user_earliest', 'user_latest']
  - Feature Selection: Lasso_MI
    - Columns dropped: ['avg_rating_for_restaurant', 'date', 'extreme_rating_index', 'rating_min', 'review_frequency_for_restaurant', 'std_dev_rating_for_restaurant', 'total_reviews_for_restaurant', 'user_earliest', 'user_latest', 'user_restaurants_reviewed']
  - Feature Selection: RFE
    - Columns dropped: ['extreme_rating_index', 'median_rating_for_restaurant', 'rating', 'rating_max', 'rating_min', 'rating_std', 'user_active_percentage', 'user_days_active', 'user_restaurants_re

### Run Pipeline (Only Baseline FMS and FSS are saved)
<br>
Baseline: All Engineered Features
<br>
MI: Features Selected by Mutual Interest
<br>
Lasso_MI: MI Features that are lassoed
<br>
RFE: Features Selected by Recursive Feature Extraction
<br>
Lasso_RFE: RFE Features that are lassoed
<br>
RFECV: Features selected by RFECV (RFE Cross Validation)
<br>
Lasso_RFECV: RFECV Features that are lassoed
<br>
<br><br>
FMS: Features, MinMaxScaler, SMOTE
<br>
FSS: Features, StandardScaler, SMOTE

In [None]:
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    balanced_accuracy_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc,
    make_scorer
)
import numpy as np
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

# Define models
models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Feature selection methods with columns to drop
drop_columns_dict = {
    "baseline": set(),
    "MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
           'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date'},
    "Lasso_MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
                 'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date', 
                 'user_restaurants_reviewed'},
    "RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
            'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min'},
    "Lasso_RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
                  'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min', 'user_earliest'},
    "RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating'},
    "Lasso_RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating', 'user_earliest', 'user_restaurants_reviewed'}
}

# Stratified K-Fold setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store results
results = []
prob_df = pd.DataFrame()

# Iterate over Feature Selection methods
for feature_selection_method, drop_columns in drop_columns_dict.items():
    
    # Iterate over pipeline types
    for p in ["FSS", "FMS", "SS"]:
        
        # SS only runs once (no feature engineering)
        if p == "SS" and feature_selection_method != "baseline":
            continue

        for name, model in models.items():
            print(f"=== Model: {name} | Feature Selection: {feature_selection_method} | Pipeline: {p} ===")

            if p == "FSS":
                pipeline = Pipeline([
                    ('feature_engineering', CombinedEngineer(drop_columns=list(drop_columns))),  # Feature engineering with column removal
                    ('scaler', StandardScaler()),  # Standard Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])
            elif p == "SS":
                # Preprocess dataset only once for SS
                X_train_copy = X_train.copy()
                X_val_copy = X_val.copy()
                
                X_train_copy['date'] = pd.to_datetime(X_train_copy['date']).astype('int64') // 10**9
                X_val_copy['date'] = pd.to_datetime(X_val_copy['date']).astype('int64') // 10**9
                X_train_copy = X_train_copy.drop(columns=["user_id", "prod_id", "review"], errors='ignore')
                X_val_copy = X_val_copy.drop(columns=["user_id", "prod_id", "review"], errors='ignore')

                pipeline = Pipeline([
                    ('scaler', StandardScaler()),  # Standard Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])
            else:
                # FMS (Feature Engineering + MinMaxScaler)
                pipeline = Pipeline([
                    ('feature_engineering', CombinedEngineer(drop_columns=list(drop_columns))),  # Feature engineering
                    ('scaler', MinMaxScaler()),  # MinMax Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])

            # Define scoring metrics
            scoring = {
                'accuracy': 'accuracy',
                'precision': 'precision',
                'recall': 'recall',
                'f1': 'f1',
                'balanced_accuracy': make_scorer(balanced_accuracy_score),
                'mcc': make_scorer(matthews_corrcoef)
            }

            # Perform cross-validation
            cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)

            # Get predicted probabilities for PR-AUC & ROC-AUC
            y_pred_proba = cross_val_predict(pipeline, X_train, y_train, cv=cv, method="predict_proba")

            # Compute additional metrics
            roc_auc_0 = roc_auc_score(y_train, y_pred_proba[:, 0])
            roc_auc_1 = roc_auc_score(y_train, y_pred_proba[:, 1])

            precision_0, recall_0, _ = precision_recall_curve(y_train, y_pred_proba[:, 0])
            pr_auc_0 = auc(recall_0, precision_0)

            precision_1, recall_1, _ = precision_recall_curve(y_train, y_pred_proba[:, 1])
            pr_auc_1 = auc(recall_1, precision_1)

            # Store metrics
            results.append({
                "Model": f"{name}_{feature_selection_method}_{p}",
                "Accuracy": f"{np.mean(cv_results['test_accuracy']):.4f}",
                "Precision": f"{np.mean(cv_results['test_precision']):.4f}",
                "Recall": f"{np.mean(cv_results['test_recall']):.4f}",
                "F1 Score": f"{np.mean(cv_results['test_f1']):.4f}",
                "Balanced Accuracy": f"{np.mean(cv_results['test_balanced_accuracy']):.4f}",
                "MCC": f"{np.mean(cv_results['test_mcc']):.4f}",
                "PR-AUC_0": f"{pr_auc_0:.4f}",
                "PR-AUC_1": f"{pr_auc_1:.4f}",
                "ROC-AUC_0": f"{roc_auc_0:.4f}",
                "ROC-AUC_1": f"{roc_auc_1:.4f}"
            })

# Save results in a single CSV
results_df = pd.DataFrame(results)
results_df.to_csv("b4_pipeline_comparison_results.csv", index=False)

print("All models evaluated and results saved.")


=== Model: LR | Feature Selection: baseline | Pipeline: FSS ===
=== Model: NN | Feature Selection: baseline | Pipeline: FSS ===




=== Model: KNN | Feature Selection: baseline | Pipeline: FSS ===
=== Model: DT | Feature Selection: baseline | Pipeline: FSS ===
=== Model: RF | Feature Selection: baseline | Pipeline: FSS ===
=== Model: AB | Feature Selection: baseline | Pipeline: FSS ===
=== Model: XGB | Feature Selection: baseline | Pipeline: FSS ===
=== Model: NB | Feature Selection: baseline | Pipeline: FSS ===
=== Model: LR | Feature Selection: baseline | Pipeline: FMS ===


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


=== Model: NN | Feature Selection: baseline | Pipeline: FMS ===




=== Model: KNN | Feature Selection: baseline | Pipeline: FMS ===
=== Model: DT | Feature Selection: baseline | Pipeline: FMS ===
=== Model: RF | Feature Selection: baseline | Pipeline: FMS ===
=== Model: AB | Feature Selection: baseline | Pipeline: FMS ===
=== Model: XGB | Feature Selection: baseline | Pipeline: FMS ===
=== Model: NB | Feature Selection: baseline | Pipeline: FMS ===
=== Model: LR | Feature Selection: baseline | Pipeline: SS ===


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 430, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 1383, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 930, in partial_fit
    X = validate_data(
        ^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\validation.py", line 2944, in validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '2011-11-13'

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 518, in fit
    Xt, yt = self._fit(X, y, routed_params, raw_params=params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 430, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\imblearn\pipeline.py", line 1383, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 921, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 894, in fit
    return self.partial_fit(X, y, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 930, in partial_fit
    X = validate_data(
        ^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\validation.py", line 2944, in validate_data
    out = check_array(X, input_name="X", **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Admin\Documents\GitHub\INF2008_YelpZip\venv\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: '2008-10-13'


### Save baseline results

In [None]:
# Filter out only FSS and FMS pipeline results
filtered_results = [r for r in results if not r["Model"].endswith("_SS")]

# Convert to DataFrame
filtered_results_df = pd.DataFrame(filtered_results)

# Save results before SS to a new CSV
filtered_results_path = "b4_pipeline_completed_FSS_FMS.csv"
filtered_results_df.to_csv(filtered_results_path, index=False)


# Return the saved file path
filtered_results_path


'b5_pipeline_completed_FSS_FMS.csv'

### Run Pipeline for FMS and FSS for Selected Engineered Feature

In [None]:
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    balanced_accuracy_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc,
    make_scorer
)
import numpy as np
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

# Define models
models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Feature selection methods with columns to drop
drop_columns_dict = {
    "baseline": set(),
    "MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
           'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date'},
    "Lasso_MI": {'user_earliest', 'extreme_rating_index', 'total_reviews_for_restaurant', 'avg_rating_for_restaurant', 
                 'review_frequency_for_restaurant', 'user_latest', 'std_dev_rating_for_restaurant', 'rating_min', 'date', 
                 'user_restaurants_reviewed'},
    "RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
            'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min'},
    "Lasso_RFE": {'rating_max', 'median_rating_for_restaurant', 'extreme_rating_index', 'rating', 'user_restaurants_reviewed', 
                  'rating_std', 'user_active_percentage', 'user_days_active', 'rating_min', 'user_earliest'},
    "RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating'},
    "Lasso_RFECV": {'rating_max', 'median_rating_for_restaurant', 'rating', 'user_earliest', 'user_restaurants_reviewed'}
}

# Stratified K-Fold setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store results
results = []
prob_df = pd.DataFrame()

# Iterate over Feature Selection methods
for feature_selection_method, drop_columns in drop_columns_dict.items():

    # Skip baseline 
    if feature_selection_method == "baseline":
        continue  

    # Iterate over pipeline types (skip SS)
    for p in ["FSS", "FMS"]:


        for name, model in models.items():
            print(f"=== Model: {name} | Feature Selection: {feature_selection_method} | Pipeline: {p} ===")

            if p == "FSS":
                # FSS (Feature Engineering + StandardScaler)
                pipeline = Pipeline([
                    ('feature_engineering', CombinedEngineer(drop_columns=list(drop_columns))),  # Feature engineering with column removal
                    ('scaler', StandardScaler()),  # Standard Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])
            elif p == "SS":
                # Preprocess dataset only once for SS
                X_train_copy = X_train.copy()
                X_val_copy = X_val.copy()
                
                X_train_copy['date'] = pd.to_datetime(X_train_copy['date']).astype('int64') // 10**9
                X_val_copy['date'] = pd.to_datetime(X_val_copy['date']).astype('int64') // 10**9
                X_train_copy = X_train_copy.drop(columns=["user_id", "prod_id", "review"], errors='ignore')
                X_val_copy = X_val_copy.drop(columns=["user_id", "prod_id", "review"], errors='ignore')

                pipeline = Pipeline([
                    ('scaler', StandardScaler()),  # Standard Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])
            else:
                # FMS (Feature Engineering + MinMaxScaler)
                pipeline = Pipeline([
                    ('feature_engineering', CombinedEngineer(drop_columns=list(drop_columns))),  # Feature engineering
                    ('scaler', MinMaxScaler()),  # MinMax Scaling
                    ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
                    ('classifier', model)  # Model
                ])

            # Define scoring metrics
            scoring = {
                'accuracy': 'accuracy',
                'precision': 'precision',
                'recall': 'recall',
                'f1': 'f1',
                'balanced_accuracy': make_scorer(balanced_accuracy_score),
                'mcc': make_scorer(matthews_corrcoef)
            }

            # Perform cross-validation
            cv_results = cross_validate(pipeline, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)

            # Get predicted probabilities for PR-AUC & ROC-AUC
            y_pred_proba = cross_val_predict(pipeline, X_train, y_train, cv=cv, method="predict_proba")

            # Compute additional metrics
            roc_auc_0 = roc_auc_score(y_train, y_pred_proba[:, 0])
            roc_auc_1 = roc_auc_score(y_train, y_pred_proba[:, 1])

            precision_0, recall_0, _ = precision_recall_curve(y_train, y_pred_proba[:, 0])
            pr_auc_0 = auc(recall_0, precision_0)

            precision_1, recall_1, _ = precision_recall_curve(y_train, y_pred_proba[:, 1])
            pr_auc_1 = auc(recall_1, precision_1)

            # Store metrics
            results.append({
                "Model": f"{name}_{feature_selection_method}_{p}",
                "Accuracy": f"{np.mean(cv_results['test_accuracy']):.4f}",
                "Precision": f"{np.mean(cv_results['test_precision']):.4f}",
                "Recall": f"{np.mean(cv_results['test_recall']):.4f}",
                "F1 Score": f"{np.mean(cv_results['test_f1']):.4f}",
                "Balanced Accuracy": f"{np.mean(cv_results['test_balanced_accuracy']):.4f}",
                "MCC": f"{np.mean(cv_results['test_mcc']):.4f}",
                "PR-AUC_0": f"{pr_auc_0:.4f}",
                "PR-AUC_1": f"{pr_auc_1:.4f}",
                "ROC-AUC_0": f"{roc_auc_0:.4f}",
                "ROC-AUC_1": f"{roc_auc_1:.4f}"
            })

# Save results in a single CSV
results_df = pd.DataFrame(results)
results_df.to_csv("b4_pipeline_comparison_results_rest.csv", index=False)

print("All models evaluated and results saved.")


=== Model: LR | Feature Selection: MI | Pipeline: FSS ===
=== Model: NN | Feature Selection: MI | Pipeline: FSS ===
=== Model: KNN | Feature Selection: MI | Pipeline: FSS ===
=== Model: DT | Feature Selection: MI | Pipeline: FSS ===
=== Model: RF | Feature Selection: MI | Pipeline: FSS ===
=== Model: AB | Feature Selection: MI | Pipeline: FSS ===
=== Model: XGB | Feature Selection: MI | Pipeline: FSS ===
=== Model: NB | Feature Selection: MI | Pipeline: FSS ===
=== Model: LR | Feature Selection: MI | Pipeline: FMS ===
=== Model: NN | Feature Selection: MI | Pipeline: FMS ===
=== Model: KNN | Feature Selection: MI | Pipeline: FMS ===
=== Model: DT | Feature Selection: MI | Pipeline: FMS ===
=== Model: RF | Feature Selection: MI | Pipeline: FMS ===
=== Model: AB | Feature Selection: MI | Pipeline: FMS ===
=== Model: XGB | Feature Selection: MI | Pipeline: FMS ===
=== Model: NB | Feature Selection: MI | Pipeline: FMS ===
=== Model: LR | Feature Selection: Lasso_MI | Pipeline: FSS ===
=== 



=== Model: KNN | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: DT | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: RF | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: AB | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: XGB | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: NB | Feature Selection: RFECV | Pipeline: FSS ===
=== Model: LR | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: NN | Feature Selection: RFECV | Pipeline: FMS ===




=== Model: KNN | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: DT | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: RF | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: AB | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: XGB | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: NB | Feature Selection: RFECV | Pipeline: FMS ===
=== Model: LR | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: NN | Feature Selection: Lasso_RFECV | Pipeline: FSS ===




=== Model: KNN | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: DT | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: RF | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: AB | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: XGB | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: NB | Feature Selection: Lasso_RFECV | Pipeline: FSS ===
=== Model: LR | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: NN | Feature Selection: Lasso_RFECV | Pipeline: FMS ===




=== Model: KNN | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: DT | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: RF | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: AB | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: XGB | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
=== Model: NB | Feature Selection: Lasso_RFECV | Pipeline: FMS ===
All models evaluated and results saved.


### Run Pipeline for SS (No Feature Selection Methods)

In [None]:
import pandas as pd
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    balanced_accuracy_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc,
    make_scorer
)
import numpy as np
from SmoteTransformer import SMOTETransformer

# Define models
models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Stratified K-Fold setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store results
results = []
prob_df = pd.DataFrame()

# Preprocess dataset for SS
X_train_SS = X_train.copy()
X_val_SS = X_val.copy()

# Convert date column to UNIX timestamp
X_train_SS['date'] = pd.to_datetime(X_train_SS['date']).astype('int64') // 10**9
X_val_SS['date'] = pd.to_datetime(X_val_SS['date']).astype('int64') // 10**9

# Drop unnecessary columns
X_train_SS = X_train_SS.drop(columns=["user_id", "prod_id", "review"], errors='ignore')
X_val_SS = X_val_SS.drop(columns=["user_id", "prod_id", "review"], errors='ignore')

# Run SS pipeline for all models
for name, model in models.items():
    print(f"=== Model: {name} | Pipeline: SS ===")

    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Standard Scaling
        ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
        ('classifier', model)  # Model
    ])

    # Define scoring metrics
    scoring = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'mcc': make_scorer(matthews_corrcoef)
    }

    # Perform cross-validation
    cv_results = cross_validate(pipeline, X_train_SS, y_train, cv=cv, scoring=scoring, return_train_score=False)

    # Get predicted probabilities for PR-AUC & ROC-AUC
    y_pred_proba = cross_val_predict(pipeline, X_train_SS, y_train, cv=cv, method="predict_proba")

    # Compute additional metrics
    roc_auc_0 = roc_auc_score(y_train, y_pred_proba[:, 0])
    roc_auc_1 = roc_auc_score(y_train, y_pred_proba[:, 1])

    precision_0, recall_0, _ = precision_recall_curve(y_train, y_pred_proba[:, 0])
    pr_auc_0 = auc(recall_0, precision_0)

    precision_1, recall_1, _ = precision_recall_curve(y_train, y_pred_proba[:, 1])
    pr_auc_1 = auc(recall_1, precision_1)

    # Store metrics
    results.append({
        "Model": f"{name}_SS",
        "Accuracy": f"{np.mean(cv_results['test_accuracy']):.4f}",
        "Precision": f"{np.mean(cv_results['test_precision']):.4f}",
        "Recall": f"{np.mean(cv_results['test_recall']):.4f}",
        "F1 Score": f"{np.mean(cv_results['test_f1']):.4f}",
        "Balanced Accuracy": f"{np.mean(cv_results['test_balanced_accuracy']):.4f}",
        "MCC": f"{np.mean(cv_results['test_mcc']):.4f}",
        "PR-AUC_0": f"{pr_auc_0:.4f}",
        "PR-AUC_1": f"{pr_auc_1:.4f}",
        "ROC-AUC_0": f"{roc_auc_0:.4f}",
        "ROC-AUC_1": f"{roc_auc_1:.4f}"
    })

# Save results in a single CSV
results_df = pd.DataFrame(results)
results_df.to_csv("b4_pipeline_comparison_results_SS.csv", index=False)

print("SS pipeline models evaluated and results saved.")


=== Model: LR | Pipeline: SS ===
=== Model: NN | Pipeline: SS ===
=== Model: KNN | Pipeline: SS ===
=== Model: DT | Pipeline: SS ===
=== Model: RF | Pipeline: SS ===
=== Model: AB | Pipeline: SS ===
=== Model: XGB | Pipeline: SS ===
=== Model: NB | Pipeline: SS ===
SS pipeline models evaluated and results saved.
