In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, precision_recall_curve, auc

# Import Dataset

In [16]:
df = pd.read_csv('../00_dataset/YelpZip/metadata', 
                 sep='\t',
                 header=None,
                 names=["user_id", "prod_id", "rating", "label", "date"])
df

Unnamed: 0,user_id,prod_id,rating,label,date
0,5044,0,1.0,-1,2014-11-16
1,5045,0,1.0,-1,2014-09-08
2,5046,0,3.0,-1,2013-10-06
3,5047,0,5.0,-1,2014-11-30
4,5048,0,5.0,-1,2014-08-28
...,...,...,...,...,...
608593,119664,5039,4.0,1,2013-01-20
608594,56277,5039,2.0,1,2012-11-12
608595,265320,5039,1.0,1,2012-08-22
608596,161722,5039,4.0,1,2011-05-11


# Dataset Pre-processing
Minimal cleanup because feature engineering will be using and converting date values.

In [17]:
# Check for Null values
df.isnull().sum()

user_id    0
prod_id    0
rating     0
label      0
date       0
dtype: int64

In [18]:
df['label'] = df['label'].replace(-1, 0)
y = df['label']
X = df.drop('label', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [19]:
print(f"""
X_train: {X_train.shape}
X_test: {X_test.shape}
y_train: {y_train.shape}
y_test: {y_test.shape}
""")


X_train: (426018, 4)
X_test: (182580, 4)
y_train: (426018,)
y_test: (182580,)



# Define Models

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier  # Boosting
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = {
    "LR": LogisticRegression(),
    "NN": MLPClassifier(),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "AB": AdaBoostClassifier(),
    "XGB": XGBClassifier(),
    "NB": GaussianNB(),
}

# Example usage: print model names
for category, model in models.items():
    print(f"{category}: {model.__class__.__name__}")

LR: LogisticRegression
NN: MLPClassifier
KNN: KNeighborsClassifier
DT: DecisionTreeClassifier
RF: RandomForestClassifier
AB: AdaBoostClassifier
XGB: XGBClassifier
NB: GaussianNB


# Create pipeline

In [None]:
# X_train['date'] = pd.to_datetime(X_train['date']).astype('int64') // 10**9
# X_test['date'] = pd.to_datetime(X_test['date']).astype('int64') // 10**9
# X_train = X_train.drop(columns=["user_id", "prod_id"])
# X_test = X_test.drop(columns=["user_id", "prod_id"])

results = []


In [22]:
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from FeatureEngineer import CombinedEngineer
from SmoteTransformer import SMOTETransformer

for name, model in models.items():
    print(f"=== Model: {name} ===")
    # Define the pipeline
    pipeline = Pipeline([
        ('feature_engineering', CombinedEngineer()),  # Feature engineering
        ('scaler', StandardScaler()),  # scaling
        ('smote', SMOTETransformer(sampling_strategy='auto', random_state=42)),  # SMOTE
        ('classifier', model)  # Classifier
    ])

    # Fit and evaluate the model
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    print(f"Accuracy: {accuracy:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")

    # Balanced accuracy
    balanced_acc = balanced_accuracy_score(y_test, y_pred)
    print(f"Balanced Accuracy: {balanced_acc:.3f}")

    # MCC
    mcc = matthews_corrcoef(y_test, y_pred)
    
    # PR-AUC
    y_pred_proba = pipeline.predict_proba(X_test)


    # ROC-AUC for both classes (Class 0 and Class 1)
    roc_auc_0 = roc_auc_score(y_test, y_pred_proba[:, 0])  # ROC-AUC for Class 0 (minority class)
    roc_auc_1 = roc_auc_score(y_test, y_pred_proba[:, 1])  # ROC-AUC for Class 1 (majority class)

    # PR-AUC for both classes
    precision_0, recall_0, _ = precision_recall_curve(y_test, y_pred_proba[:, 0])  # For Class 0
    pr_auc_0 = auc(recall_0, precision_0)

    precision_1, recall_1, _ = precision_recall_curve(y_test, y_pred_proba[:, 1])  # For Class 1
    pr_auc_1 = auc(recall_1, precision_1)

    # print(f"Balanced Accuracy: {balanced_acc:.3f}")
    # print(f"Matthews Correlation Coefficient: {mcc:.3f}")
    # print(f"PR-AUC for class 0: {pr_auc_0:.3f}")
    # print(f"PR-AUC for class 1: {pr_auc_1:.3f}")

    results.append({
        "Model": f"{name}_FESS",
        "Accuracy": f"{accuracy:.4f}",
        "Precision": f"{precision:.4f}",
        "Recall": f"{recall:.4f}",
        "F1 Score": f"{f1:.4f}",
        "Balanced Accuracy": f"{balanced_acc:.4f}",
        "MCC": f"{mcc:.4f}",
        "PR-AUC_0": f"{pr_auc_0:.4f}",
        "PR-AUC_1": f"{pr_auc_1:.4f}",
        "ROC-AUC_0": f"{roc_auc_0:.4f}",
        "ROC-AUC_1": f"{roc_auc_1:.4f}"
    })

=== Model: LR ===
Accuracy: 0.655, Precision: 0.937, Recall: 0.645, F1: 0.764
Balanced Accuracy: 0.681
=== Model: NN ===




Accuracy: 0.802, Precision: 0.874, Recall: 0.902, F1: 0.888
Balanced Accuracy: 0.524
=== Model: KNN ===
Accuracy: 0.749, Precision: 0.891, Recall: 0.810, F1: 0.848
Balanced Accuracy: 0.579
=== Model: DT ===
Accuracy: 0.767, Precision: 0.898, Recall: 0.825, F1: 0.860
Balanced Accuracy: 0.605
=== Model: RF ===
Accuracy: 0.840, Precision: 0.887, Recall: 0.936, F1: 0.910
Balanced Accuracy: 0.575
=== Model: AB ===




Accuracy: 0.734, Precision: 0.873, Recall: 0.811, F1: 0.841
Balanced Accuracy: 0.517
=== Model: XGB ===
Accuracy: 0.667, Precision: 0.960, Recall: 0.643, F1: 0.770
Balanced Accuracy: 0.734
=== Model: NB ===
Accuracy: 0.549, Precision: 0.977, Recall: 0.491, F1: 0.654
Balanced Accuracy: 0.708


In [23]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Sort the results by "Model" alphabetically
results_df = results_df.sort_values(by="Model")

# Display the results table
print("\n=== Model Evaluation Results ===")
print(results_df)


=== Model Evaluation Results ===
           Model Accuracy Precision  Recall F1 Score Balanced Accuracy  \
21       AB_FESS   0.7335    0.8726  0.8114   0.8409            0.5169   
13   AB_no_smote   0.8678    0.8678  1.0000   0.9292            0.5000   
5       AB_smote   0.5717    0.9085  0.5631   0.6953            0.5954   
19       DT_FESS   0.7668    0.8981  0.8248   0.8599            0.6054   
11   DT_no_smote   0.8643    0.8704  0.9913   0.9269            0.5111   
3       DT_smote   0.6272    0.8925  0.6485   0.7512            0.5680   
18      KNN_FESS   0.7487    0.8908  0.8096   0.8483            0.5791   
10  KNN_no_smote   0.8534    0.8710  0.9756   0.9203            0.5135   
2      KNN_smote   0.8496    0.8718  0.9692   0.9179            0.5169   
16       LR_FESS   0.6546    0.9375  0.6450   0.7642            0.6813   
8    LR_no_smote   0.8678    0.8678  1.0000   0.9292            0.5000   
0       LR_smote   0.5602    0.8667  0.5828   0.6969            0.4973   
23  

In [24]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Balanced Accuracy,MCC,PR-AUC_0,PR-AUC_1,ROC-AUC_0,ROC-AUC_1
21,AB_FESS,0.7335,0.8726,0.8114,0.8409,0.5169,0.0289,0.8266,0.9354,0.3457,0.6543
13,AB_no_smote,0.8678,0.8678,1.0,0.9292,0.5,0.0,0.8129,0.9086,0.3707,0.6293
5,AB_smote,0.5717,0.9085,0.5631,0.6953,0.5954,0.1296,0.8131,0.9087,0.3705,0.6295
19,DT_FESS,0.7668,0.8981,0.8248,0.8599,0.6054,0.1776,0.8197,0.9375,0.3945,0.6055
11,DT_no_smote,0.8643,0.8704,0.9913,0.9269,0.5111,0.07,0.823,0.9019,0.3966,0.6034
3,DT_smote,0.6272,0.8925,0.6485,0.7512,0.568,0.0954,0.8275,0.8987,0.4074,0.5926
18,KNN_FESS,0.7487,0.8908,0.8096,0.8483,0.5791,0.1313,0.819,0.932,0.3762,0.6238
10,KNN_no_smote,0.8534,0.871,0.9756,0.9203,0.5135,0.0557,0.8372,0.9136,0.4455,0.5545
2,KNN_smote,0.8496,0.8718,0.9692,0.9179,0.5169,0.0621,0.838,0.9124,0.448,0.552
16,LR_FESS,0.6546,0.9375,0.645,0.7642,0.6813,0.2505,0.7968,0.9531,0.2626,0.7374


In [25]:
results_df.to_csv("b2_results.csv")

In [11]:
# 1. Access individual steps in the pipeline
# feature_engineering_step = pipeline.named_steps['feature_engineering']
scaler_step = pipeline.named_steps['scaler']
smote_step = pipeline.named_steps['smote']
classifier_step = pipeline.named_steps['classifier']

X_engineered = X_train
# 2. Check the transformed data at each step
# For training data
# X_engineered = feature_engineering_step.transform(X_train)
X_scaled = scaler_step.transform(X_engineered)
X_resampled, y_resampled = smote_step.fit_resample(X_scaled, y_train)

# 3. Check shapes to understand SMOTE's effect
print(f"Original X_train shape: {X_train.shape}")
print(f"After engineering: {X_engineered.shape}")
print(f"After scaling: {X_scaled.shape}")
print(f"After SMOTE: {X_resampled.shape}, y_resampled: {y_resampled.shape}")

# 4. Compare class distributions
from collections import Counter
print(f"Original class distribution: {Counter(y_train)}")
print(f"Resampled class distribution: {Counter(y_resampled)}")

# 5. Extract feature importances (if classifier supports it)
if hasattr(classifier_step, 'feature_importances_'):
    importances = classifier_step.feature_importances_
    print(importances)

    # Print top features
elif hasattr(classifier_step, 'coef_'):
    importances = classifier_step.coef_[0]

Original X_train shape: (426018, 2)
After engineering: (426018, 2)
After scaling: (426018, 2)
After SMOTE: (739384, 2), y_resampled: (739384,)
Original class distribution: Counter({1: 369692, 0: 56326})
Resampled class distribution: Counter({0: 369692, 1: 369692})


In [12]:
# checking versions (sync with kaggle if using)
import sys
import numpy as np
import pandas as pd
import sklearn

print("Python Version:", sys.version)
print("NumPy Version:", np.__version__)
print("Pandas Version:", pd.__version__)
print("Scikit-Learn Version:", sklearn.__version__)

Python Version: 3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]
NumPy Version: 1.26.3
Pandas Version: 2.2.1
Scikit-Learn Version: 1.5.0
