In [1]:
# Data Manipulation
import pandas as pd
import numpy as np
import pyarrow.parquet

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (accuracy_score, classification_report, roc_auc_score, 
                             confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef, 
                             balanced_accuracy_score, precision_recall_curve, auc)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset

In [2]:
df_train = pd.read_parquet("00_dataset/without_stopwords/tfidfWithNGram/train_features.parquet")
df_test = pd.read_parquet("00_dataset/without_stopwords/tfidfWithNGram/test_features.parquet")
df_val = pd.read_parquet("00_dataset/without_stopwords/tfidfWithNGram/val_features.parquet")

In [3]:
# Separate into X and y
X_train = df_train.drop(columns=["label"])
y_train = df_train["label"]

X_test = df_test.drop(columns=["label"])
y_test = df_test["label"]

X_val = df_val.drop(columns=["label"])
y_val = df_val["label"]

In [4]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4999,sentiment_score,vader_score,review_length,exclamation_count,question_count,uppercase_ratio,duplicate_word_count,emoji_count,avg_word_length
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.194444,0.9611,73,0,0,0.027708,23,0,4.452055
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.187037,0.9422,46,1,0,0.030534,12,0,4.717391
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.147253,0.7906,38,0,0,0.031579,9,0,4.026316
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.253842,0.9874,108,8,0,0.076923,41,0,4.185185
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.257143,0.8903,53,0,0,0.040134,21,0,4.660378


In [4]:
# Load Min Max Scaler
mms = joblib.load('00_dataset/without_stopwords/tfidfWithNGram/minmax_scaler.joblib')
X_train_scaled = mms.transform(X_train)
X_test_scaled = mms.transform(X_test)
X_val_scaled = mms.transform(X_val)

In [6]:
# Check class distribution after split
print("Train Class Distribution:\n", y_train.value_counts(normalize=True))
print("\nValidation Class Distribution:\n", y_val.value_counts(normalize=True))
print("\nTest Class Distribution:\n", y_test.value_counts(normalize=True))

Train Class Distribution:
 label
0    0.867799
1    0.132201
Name: proportion, dtype: float64

Validation Class Distribution:
 label
0    0.867797
1    0.132203
Name: proportion, dtype: float64

Test Class Distribution:
 label
0    0.867797
1    0.132203
Name: proportion, dtype: float64


In [6]:
X_train_scaled

array([[0.5972222 , 0.98063934, 0.06336806, ..., 0.        , 0.        ,
        0.        ],
       [0.5935185 , 0.97118413, 0.03993056, ..., 0.        , 0.        ,
        0.        ],
       [0.42637363, 0.89534247, 0.03298611, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.6029514 , 0.9971484 , 0.1467014 , ..., 0.        , 0.        ,
        0.        ],
       [0.73392856, 0.96453047, 0.0390625 , ..., 0.        , 0.        ,
        0.        ],
       [0.5621693 , 0.9788884 , 0.07986111, ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [7]:
print("Contains negative values:", np.any(X_train_scaled < 0))


Contains negative values: False


# Apply SMOTE

In [7]:
smote50 = SMOTE(sampling_strategy=0.50, random_state=42)
X_train_smote50, y_train_smote50 = smote50.fit_resample(X_train_scaled, y_train)

joblib.dump((X_train_smote50, y_train_smote50), "00_dataset/without_stopwords/tfidfWithNGram/smote50_resampled.joblib")
print("SMOTE50-transformed dataset saved successfully.")

SMOTE50-transformed dataset saved successfully.


In [5]:
# Load resampled dataset
X_train_smote50, y_train_smote50 = joblib.load("00_dataset/without_stopwords/tfidfWithNGram/smote50_resampled.joblib")

In [9]:
X_train_smote50.dtype

dtype('float32')

In [12]:
y_train.dtype

dtype('int16')

In [9]:
nb_smote50 = MultinomialNB()
nb_smote50.fit(X_train_smote50, y_train_smote50)

# Save the model
joblib.dump(nb_smote50, '00_dataset/without_stopwords/tfidfWithNGram/smote/nb_smote50.joblib')
print("Naive Bayes with SMOTE50 training completed and saved.")

Naive Bayes with SMOTE50 training completed and saved.


In [11]:
# Load the last saved model
nb_smote50 = joblib.load('00_dataset/without_stopwords/tfidfWithNGram/smote/nb_smote50.joblib')

# Predict on validation data
y_val_pred = nb_smote50.predict(X_val_scaled)
y_proba = nb_smote50.predict_proba(X_val_scaled)[:, 1]

# Evaluate performance
accuracy = accuracy_score(y_val, y_val_pred)
balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_proba)
mcc = matthews_corrcoef(y_val, y_val_pred)
precision, recall, _ = precision_recall_curve(y_val, y_proba)
pr_auc = auc(recall, precision)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Precision-Recall AUC (PR-AUC): {pr_auc:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8273
Balanced Accuracy: 0.5875
ROC-AUC Score: 0.7179
Matthews Correlation Coefficient (MCC): 0.1898
Precision-Recall AUC (PR-AUC): 0.2624

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     52802
           1       0.32      0.26      0.29      8044

    accuracy                           0.83     60846
   macro avg       0.60      0.59      0.59     60846
weighted avg       0.81      0.83      0.82     60846



In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Define Naïve Bayes model
nb = MultinomialNB()

# Define hyperparameter grid
param_grid = {'alpha': [0.01, 0.1, 0.5, 1, 5, 10]}

# Perform GridSearchCV
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='average_precision', n_jobs=1, verbose=2)
grid_search.fit(X_train_smote50, y_train_smote50)

# Best parameters
print("Best Alpha:", grid_search.best_params_)

# Evaluate on validation set
best_nb = grid_search.best_estimator_
y_val_pred = best_nb.predict(X_val_scaled)

# Performance Metrics
from sklearn.metrics import classification_report, roc_auc_score
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))
print("ROC-AUC Score: ", roc_auc_score(y_val, best_nb.predict_proba(X_val_scaled)[:, 1]))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END .........................................alpha=0.01; total time= 1.2min
[CV] END .........................................alpha=0.01; total time= 1.0min
[CV] END .........................................alpha=0.01; total time= 1.0min
[CV] END .........................................alpha=0.01; total time=  57.5s
[CV] END .........................................alpha=0.01; total time=  59.4s
[CV] END ..........................................alpha=0.1; total time= 1.1min
[CV] END ..........................................alpha=0.1; total time= 1.0min
[CV] END ..........................................alpha=0.1; total time= 1.0min
[CV] END ..........................................alpha=0.1; total time= 1.0min
[CV] END ..........................................alpha=0.1; total time= 1.0min
[CV] END ..........................................alpha=0.5; total time= 1.0min
[CV] END ........................................

13 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 759, in fit
    self._count(X, Y)
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 882, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
           

Best Alpha: {'alpha': 0.01}

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     52802
           1       0.31      0.26      0.29      8044

    accuracy                           0.83     60846
   macro avg       0.60      0.59      0.59     60846
weighted avg       0.81      0.83      0.82     60846

ROC-AUC Score:  0.7177423295016683


In [14]:
# Define the model
nb = MultinomialNB()

# Fine-tuned alpha values (smaller steps)
param_grid = {'alpha': [0.001, 0.005, 0.01]}

# Perform GridSearchCV
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='average_precision', n_jobs=1, verbose=2)
grid_search.fit(X_train_smote50, y_train_smote50)

# Best parameters
print("Best Alpha:", grid_search.best_params_)

# Evaluate on validation set
best_nb = grid_search.best_estimator_
y_val_pred = best_nb.predict(X_val_scaled)

# Performance Metrics
y_proba = best_nb.predict_proba(X_val_scaled)[:, 1]

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))
print("ROC-AUC Score:", roc_auc_score(y_val, y_proba))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ........................................alpha=0.001; total time=  11.3s
[CV] END ........................................alpha=0.001; total time=  12.9s
[CV] END ........................................alpha=0.001; total time=   7.5s
[CV] END ........................................alpha=0.001; total time=   8.0s
[CV] END ........................................alpha=0.001; total time=   7.7s
[CV] END ........................................alpha=0.005; total time=   7.9s
[CV] END ........................................alpha=0.005; total time=   6.1s
[CV] END ........................................alpha=0.005; total time=   5.6s
[CV] END ........................................alpha=0.005; total time=   6.9s
[CV] END ........................................alpha=0.005; total time=   9.6s
[CV] END .........................................alpha=0.01; total time=   7.4s
[CV] END ........................................

ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 759, in fit
    self._count(X, Y)
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 882, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
                           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\sklearn\utils\extmath.py", line 208, in safe_sparse_dot
    ret = a @ b
          ~~^~~
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 16.6 GiB for an array with shape (443535, 5009) and data type float64

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 759, in fit
    self._count(X, Y)
  File "c:\Python312\Lib\site-packages\sklearn\naive_bayes.py", line 882, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
                           ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Python312\Lib\site-packages\sklearn\utils\extmath.py", line 208, in safe_sparse_dot
    ret = a @ b
          ~~^~~
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 16.6 GiB for an array with shape (443536, 5009) and data type float64


In [8]:
nb = MultinomialNB(alpha=0.01)

# Fit the model
nb.fit(X_train_smote50, y_train_smote50)

# Save the model
joblib.dump(nb, '00_dataset/without_stopwords/tfidfWithNGram/smote/nb_smote50_tuned.joblib')
print("Naïve Bayes with SMOTE50 and tuned hyperparameters training completed and saved.")

Naïve Bayes with SMOTE50 and tuned hyperparameters training completed and saved.


In [7]:
# Predict on validation data
y_val_pred = nb.predict(X_val_scaled)
y_proba = nb.predict_proba(X_val_scaled)[:, 1]

# Evaluate performance
accuracy = accuracy_score(y_val, y_val_pred)
balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_proba)
mcc = matthews_corrcoef(y_val, y_val_pred)
precision, recall, _ = precision_recall_curve(y_val, y_proba)
pr_auc = auc(recall, precision)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Precision-Recall AUC (PR-AUC): {pr_auc:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8270
Balanced Accuracy: 0.5877
ROC-AUC Score: 0.7177
Matthews Correlation Coefficient (MCC): 0.1898
Precision-Recall AUC (PR-AUC): 0.2623

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     52802
           1       0.31      0.26      0.29      8044

    accuracy                           0.83     60846
   macro avg       0.60      0.59      0.59     60846
weighted avg       0.81      0.83      0.82     60846



In [15]:
nb = MultinomialNB(alpha=0.001)

# Fit the model
nb.fit(X_train_smote50, y_train_smote50)

# Predict on validation data
y_val_pred = nb.predict(X_val_scaled)
y_proba = nb.predict_proba(X_val_scaled)[:, 1]

# Evaluate performance
accuracy = accuracy_score(y_val, y_val_pred)
balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_proba)
mcc = matthews_corrcoef(y_val, y_val_pred)
precision, recall, _ = precision_recall_curve(y_val, y_proba)
pr_auc = auc(recall, precision)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Precision-Recall AUC (PR-AUC): {pr_auc:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8270
Balanced Accuracy: 0.5877
ROC-AUC Score: 0.7177
Matthews Correlation Coefficient (MCC): 0.1898
Precision-Recall AUC (PR-AUC): 0.2623

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     52802
           1       0.31      0.26      0.29      8044

    accuracy                           0.83     60846
   macro avg       0.60      0.59      0.59     60846
weighted avg       0.81      0.83      0.82     60846



In [6]:
nb = MultinomialNB(alpha=0.005)

# Fit the model
nb.fit(X_train_smote50, y_train_smote50)

In [7]:
# Predict on validation data
y_val_pred = nb.predict(X_val_scaled)
y_proba = nb.predict_proba(X_val_scaled)[:, 1]

# Evaluate performance
accuracy = accuracy_score(y_val, y_val_pred)
balanced_acc = balanced_accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_proba)
mcc = matthews_corrcoef(y_val, y_val_pred)
precision, recall, _ = precision_recall_curve(y_val, y_proba)
pr_auc = auc(recall, precision)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Precision-Recall AUC (PR-AUC): {pr_auc:.4f}")

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8270
Balanced Accuracy: 0.5877
ROC-AUC Score: 0.7177
Matthews Correlation Coefficient (MCC): 0.1898
Precision-Recall AUC (PR-AUC): 0.2623

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     52802
           1       0.31      0.26      0.29      8044

    accuracy                           0.83     60846
   macro avg       0.60      0.59      0.59     60846
weighted avg       0.81      0.83      0.82     60846



Final Model

In [9]:
# Load the last saved model
best_nb= joblib.load('00_dataset/without_stopwords/tfidfWithNGram/smote/nb_smote50_tuned.joblib')

# Predict probabilities on test data
y_test_proba = best_nb.predict_proba(X_test_scaled)  # Returns probabilities for both classes

# Extract probabilities for each class
prob_class_0 = y_test_proba[:, 0]  # Probability of being class 0 (not spam)
prob_class_1 = y_test_proba[:, 1]  # Probability of being class 1 (spam)

# Predict the actual class labels
y_test_pred = best_nb.predict(X_test_scaled)

In [11]:
# Assuming y_test is the actual labels of the test dataset
df_results = pd.DataFrame({
    'Actual Label': y_test,
    'Predicted Label': y_test_pred,
    'Prob (Class 0 - Not Spam)': prob_class_0,
    'Prob (Class 1 - Spam)': prob_class_1
})

In [12]:
# Print classification report
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

# Compute ROC-AUC Score
roc_auc = roc_auc_score(y_test, prob_class_1)
print("ROC-AUC Score:", roc_auc)


Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90    105604
           1       0.31      0.26      0.28     16088

    accuracy                           0.82    121692
   macro avg       0.60      0.59      0.59    121692
weighted avg       0.81      0.82      0.82    121692

ROC-AUC Score: 0.7172433843110835


In [13]:
print(df_results.head())

df_results.to_csv("test_predictions_with_probabilities.csv", index=False)

   Actual Label  Predicted Label  Prob (Class 0 - Not Spam)  \
0             0                0                   0.535518   
1             0                0                   0.625511   
2             1                0                   0.607732   
3             0                0                   0.888334   
4             0                0                   0.734379   

   Prob (Class 1 - Spam)  
0               0.464482  
1               0.374489  
2               0.392268  
3               0.111666  
4               0.265621  
