In [2]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("../data/spambase_cleaned.csv")

# Features (X) and target (y)
X = df.drop(columns="is_spam")
y = df["is_spam"]


In [3]:
from sklearn.model_selection import train_test_split

# 80/20 split for training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
from sklearn.preprocessing import StandardScaler

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
from sklearn.pipeline import Pipeline

def make_pipeline(model):
    return Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", model)
    ])


In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Build and train
gnb_pipeline = make_pipeline(GaussianNB())
gnb_pipeline.fit(X_train, y_train)

# Evaluate
y_pred_gnb = gnb_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_gnb))


              precision    recall  f1-score   support

           0       0.96      0.75      0.84       506
           1       0.71      0.95      0.82       336

    accuracy                           0.83       842
   macro avg       0.84      0.85      0.83       842
weighted avg       0.86      0.83      0.83       842



In [7]:
from sklearn.linear_model import LogisticRegression

# Build and train
lr_pipeline = make_pipeline(LogisticRegression(max_iter=1000, random_state=42))
lr_pipeline.fit(X_train, y_train)

# Evaluate
y_pred_lr = lr_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_lr))


              precision    recall  f1-score   support

           0       0.94      0.95      0.95       506
           1       0.93      0.90      0.92       336

    accuracy                           0.93       842
   macro avg       0.93      0.93      0.93       842
weighted avg       0.93      0.93      0.93       842



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Build and train
rf_pipeline = make_pipeline(RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1
))
rf_pipeline.fit(X_train, y_train)

# Evaluate
y_pred_rf = rf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_rf))


              precision    recall  f1-score   support

           0       0.95      0.96      0.95       506
           1       0.94      0.93      0.93       336

    accuracy                           0.95       842
   macro avg       0.94      0.94      0.94       842
weighted avg       0.95      0.95      0.95       842



In [9]:
from sklearn.model_selection import GridSearchCV

# Grids for each classifier
param_grids = {
    'gnb': {
        # GaussianNB has no major hyperparameters to tune,
        # but you can try var_smoothing if desired:
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]
    },
    'lr': {
        # LogisticRegression: vary regularization strength C and penalty
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']
    },
    'rf': {
        # RandomForest: number of trees, max depth, and min samples leaf
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_leaf': [1, 2, 4]
    }
}


In [10]:
# Assuming you have make_pipeline(model) defined already:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Create pipelines
pipelines = {
    'gnb': make_pipeline(GaussianNB()),
    'lr': make_pipeline(LogisticRegression(max_iter=1000, random_state=42)),
    'rf': make_pipeline(RandomForestClassifier(random_state=42, n_jobs=-1))
}

# Store search objects
grid_searches = {}

for name in pipelines:
    gs = GridSearchCV(
        estimator=pipelines[name],
        param_grid=param_grids[name],
        scoring='f1',        # optimize for F1-score
        cv=5,                # 5-fold cross-validation
        n_jobs=-1,
        verbose=2
    )
    print(f"Running GridSearch for {name}...")
    gs.fit(X_train, y_train)
    grid_searches[name] = gs
    print(f"Best params for {name}: {gs.best_params_}")
    print(f"Best CV F1-score: {gs.best_score_:.3f}\n")


Running GridSearch for gnb...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ....................classifier__var_smoothing=1e-09; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-09; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-09; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-08; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-09; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-08; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-09; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-08; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-08; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-07; total time=   0.0s
[CV] END ....................classifier__var_smoothing=1e-08; total time=   0.0s
[CV] END ..........

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

for name, gs in grid_searches.items():
    print(f"=== {name.upper()} ===")
    best_model = gs.best_estimator_
    y_pred = best_model.predict(X_test)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")


=== GNB ===
Classification Report:
              precision    recall  f1-score   support

           0     0.9594    0.7470    0.8400       506
           1     0.7143    0.9524    0.8163       336

    accuracy                         0.8290       842
   macro avg     0.8368    0.8497    0.8282       842
weighted avg     0.8616    0.8290    0.8306       842

Confusion Matrix:
[[378 128]
 [ 16 320]]


=== LR ===
Classification Report:
              precision    recall  f1-score   support

           0     0.9436    0.9585    0.9510       506
           1     0.9360    0.9137    0.9247       336

    accuracy                         0.9406       842
   macro avg     0.9398    0.9361    0.9378       842
weighted avg     0.9405    0.9406    0.9405       842

Confusion Matrix:
[[485  21]
 [ 29 307]]


=== RF ===
Classification Report:
              precision    recall  f1-score   support

           0     0.9491    0.9585    0.9538       506
           1     0.9366    0.9226    0.9295     

In [12]:
import joblib

# 1. Gather best CV F1-scores
best_scores = {name: gs.best_score_ for name, gs in grid_searches.items()}

# 2. Determine which model had the highest CV F1
best_name = max(best_scores, key=best_scores.get)
best_score = best_scores[best_name]
best_model = grid_searches[best_name].best_estimator_

print(f"Best model overall: {best_name.upper()} with CV F1 = {best_score:.4f}")

# 3. Export the model to a .pkl file
output_path = f"../models/{best_name}_best_model.pkl"
joblib.dump(best_model, output_path)
print(f"Exported {best_name} to {output_path}")


Best model overall: RF with CV F1 = 0.9317
Exported rf to ../models/rf_best_model.pkl
