In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, precision_recall_curve, auc

# Import Dataset

In [22]:
df = pd.read_csv('../00_dataset/YelpZip/metadata', 
                 sep='\t',
                 header=None,
                 names=["user_id", "prod_id", "rating", "label", "date"])
df

Unnamed: 0,user_id,prod_id,rating,label,date
0,5044,0,1.0,-1,2014-11-16
1,5045,0,1.0,-1,2014-09-08
2,5046,0,3.0,-1,2013-10-06
3,5047,0,5.0,-1,2014-11-30
4,5048,0,5.0,-1,2014-08-28
...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20
608594,56277,5039,2.0,1,2012-11-12
608595,265320,5039,1.0,1,2012-08-22
608596,161722,5039,4.0,1,2011-05-11


# Dataset Pre-processing
Minimal cleanup because feature engineering will be using and converting date values.

In [23]:
# Check for Null values
df.isnull().sum()

user_id    0
prod_id    0
rating     0
label      0
date       0
dtype: int64

In [24]:
df['label'] = df['label'].replace(-1, 0)
y = df['label']
X = df.drop('label', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [25]:
print(f"""
X_train: {X_train.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}
""")


X_train: (426018, 4)
X_test: (182580, 4)
y_train: (426018,)
y_test: (182580,)



# Define Models

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier  # Boosting
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Example usage: print model names
for category, model in models.items():
    print(f"{category}: {model.__class__.__name__}")

LR: LogisticRegression
NN: MLPClassifier
KNN: KNeighborsClassifier
DT: DecisionTreeClassifier
RF: RandomForestClassifier
AB: AdaBoostClassifier
XGB: XGBClassifier
NB: GaussianNB


# Create pipeline

In [27]:
# X_train['date'] = pd.to_datetime(X_train['date']).astype('int64') // 10**9
# X_test['date'] = pd.to_datetime(X_test['date']).astype('int64') // 10**9
# X_train = X_train.drop(columns=["user_id", "prod_id"])
# X_test = X_test.drop(columns=["user_id", "prod_id"])

results = []


In [None]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

for name, model in models.items():
    print(f"=== Model: {name} ===")
    # Define the pipeline
    pipeline = Pipeline([
        ('feature_engineering', CombinedEngineer(drop_columns=[])),  # drop_columns = [] or None for no features dropped
        ('scaler', StandardScaler()),  # scaling
        ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
        ('classifier', model)  # Classifier
    ])

    # Fit and evaluate the model
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

    # Balanced accuracy
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    print(f"Balanced Accuracy: {balanced_acc:.3f}")

    # MCC
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # PR-AUC
    y_pred_proba = pipeline.predict_proba(X_test)


    # ROC-AUC for both classes (Class 0 and Class 1)
    roc_auc_0 = roc_auc_score(y_test, y_pred_proba[:, 0])  # ROC-AUC for Class 0 (minority class)
    roc_auc_1 = roc_auc_score(y_test, y_pred_proba[:, 1])  # ROC-AUC for Class 1 (majority class)

    # PR-AUC for both classes
    precision_0, recall_0, _ = precision_recall_curve(y_test, y_pred_proba[:, 0])  # For Class 0
    pr_auc_0 = auc(recall_0, precision_0)

    precision_1, recall_1, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])  # For Class 1
    pr_auc_1 = auc(recall_1, precision_1)

    # print(f"Balanced Accuracy: {balanced_acc:.3f}")
    # print(f"Matthews Correlation Coefficient: {mcc:.3f}")
    # print(f"PR-AUC for class 0: {pr_auc_0:.3f}")
    # print(f"PR-AUC for class 1: {pr_auc_1:.3f}")

    results.append({
        "Model": f"{name}_FESS",
        "Accuracy": f"{accuracy:.4f}",
        "Precision": f"{precision:.4f}",
        "Recall": f"{recall:.4f}",
        "F1 Score": f"{f1:.4f}",
        "Balanced Accuracy": f"{balanced_acc:.4f}",
        "MCC": f"{mcc:.4f}",
        "PR-AUC_0": f"{pr_auc_0:.4f}",
        "PR-AUC_1": f"{pr_auc_1:.4f}",
        "ROC-AUC_0": f"{roc_auc_0:.4f}",
        "ROC-AUC_1": f"{roc_auc_1:.4f}"
    })

=== Model: LR ===
Accuracy: 0.557, Precision: 0.972, Recall: 0.505, F1: 0.664
Balanced Accuracy: 0.705
=== Model: NN ===
Accuracy: 0.692, Precision: 0.920, Recall: 0.707, F1: 0.799
Balanced Accuracy: 0.653
=== Model: KNN ===
Accuracy: 0.729, Precision: 0.898, Recall: 0.777, F1: 0.833
Balanced Accuracy: 0.598
=== Model: DT ===
Accuracy: 0.792, Precision: 0.885, Recall: 0.874, F1: 0.880
Balanced Accuracy: 0.565
=== Model: RF ===
Accuracy: 0.837, Precision: 0.880, Recall: 0.940, F1: 0.909
Balanced Accuracy: 0.549
=== Model: AB ===
Accuracy: 0.724, Precision: 0.894, Recall: 0.773, F1: 0.829
Balanced Accuracy: 0.587
=== Model: XGB ===
Accuracy: 0.626, Precision: 0.972, Recall: 0.586, F1: 0.731
Balanced Accuracy: 0.738
=== Model: NB ===
Accuracy: 0.578, Precision: 0.959, Recall: 0.537, F1: 0.689
Balanced Accuracy: 0.693


In [29]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by "Model" alphabetically
results_df = results_df.sort_values(by="Model")

# Display the results table
print("\n=== Model Evaluation Results ===")
print(results_df)


=== Model Evaluation Results ===
      Model Accuracy Precision  Recall F1 Score Balanced Accuracy     MCC  \
5   AB_FESS   0.7240    0.8944  0.7733   0.8294            0.5870  0.1361   
3   DT_FESS   0.7923    0.8852  0.8740   0.8796            0.5651  0.1258   
2  KNN_FESS   0.7294    0.8976  0.7768   0.8328            0.5975  0.1528   
0   LR_FESS   0.5575    0.9720  0.5046   0.6643            0.7045  0.2785   
7   NB_FESS   0.5784    0.9588  0.5372   0.6886            0.6929  0.2615   
1   NN_FESS   0.6923    0.9205  0.7065   0.7994            0.6529  0.2196   
4   RF_FESS   0.8370    0.8800  0.9405   0.9092            0.5492  0.1285   
6  XGB_FESS   0.6260    0.9722  0.5857   0.7310            0.7380  0.3228   

  PR-AUC_0 PR-AUC_1 ROC-AUC_0 ROC-AUC_1  
5   0.8055   0.9514    0.2743    0.7257  
3   0.8240   0.9343    0.4350    0.5650  
2   0.8165   0.9343    0.3565    0.6435  
0   0.7851   0.9550    0.2463    0.7537  
7   0.7900   0.9524    0.2589    0.7411  
1   0.8034   0.9482 

In [30]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Balanced Accuracy,MCC,PR-AUC_0,PR-AUC_1,ROC-AUC_0,ROC-AUC_1
5,AB_FESS,0.724,0.8944,0.7733,0.8294,0.587,0.1361,0.8055,0.9514,0.2743,0.7257
3,DT_FESS,0.7923,0.8852,0.874,0.8796,0.5651,0.1258,0.824,0.9343,0.435,0.565
2,KNN_FESS,0.7294,0.8976,0.7768,0.8328,0.5975,0.1528,0.8165,0.9343,0.3565,0.6435
0,LR_FESS,0.5575,0.972,0.5046,0.6643,0.7045,0.2785,0.7851,0.955,0.2463,0.7537
7,NB_FESS,0.5784,0.9588,0.5372,0.6886,0.6929,0.2615,0.79,0.9524,0.2589,0.7411
1,NN_FESS,0.6923,0.9205,0.7065,0.7994,0.6529,0.2196,0.8034,0.9482,0.2761,0.7239
4,RF_FESS,0.837,0.88,0.9405,0.9092,0.5492,0.1285,0.779,0.9603,0.2297,0.7703
6,XGB_FESS,0.626,0.9722,0.5857,0.731,0.738,0.3228,0.7641,0.9656,0.1984,0.8016


In [31]:
results_df.to_csv("b2_results_RFE.csv")

In [None]:
# 1. Access individual steps in the pipeline
# feature_engineering_step = pipeline.named_steps['feature_engineering']
scaler_step = pipeline.named_steps['scaler']
smote_step = pipeline.named_steps['smote']
classifier_step = pipeline.named_steps['classifier']

X_engineered = X_train
# 2. Check the transformed data at each step
# For training data
# X_engineered = feature_engineering_step.transform(X_train)
X_scaled = scaler_step.transform(X_engineered)
X_resampled, y_resampled = smote_step.fit_resample(X_scaled, y_train)

# 3. Check shapes to understand SMOTE's effect
print(f"Original X_train shape: {X_train.shape}")
print(f"After engineering: {X_engineered.shape}")
print(f"After scaling: {X_scaled.shape}")
print(f"After SMOTE: {X_resampled.shape}, y_resampled: {y_resampled.shape}")

# 4. Compare class distributions
from collections import Counter
print(f"Original class distribution: {Counter(y_train)}")
print(f"Resampled class distribution: {Counter(y_resampled)}")

# 5. Extract feature importances (if classifier supports it)
if hasattr(classifier_step, 'feature_importances_'):
    importances = classifier_step.feature_importances_
    print(importances)

    # Print top features
elif hasattr(classifier_step, 'coef_'):
    importances = classifier_step.coef_[0]

In [None]:
# checking versions (sync with kaggle if using)
import sys
import numpy as np
import pandas as pd
import sklearn

print("Python Version:", sys.version)
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Scikit-Learn Version:", sklearn.__version__)

: 