In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# 1. Read the cleaned dataset from disk
df = pd.read_csv("../data/spambase_cleaned.csv")

# 2. Separate the label column (y) from the remaining feature columns (X)
y = df["is_spam"]
X = df.drop(columns="is_spam")

# 3. Hold out 20% of the rows for testing. The split is stratified on y and
#    uses a fixed seed so it is repeatable from run to run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the resulting partition sizes (one line per partition)
print(
    f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}",
    f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}",
    sep="\n",
)


Training set shape: X_train=(3368, 57), y_train=(3368,)
Test set shape: X_test=(842, 57), y_test=(842,)


In [27]:
# Standardize the feature columns. The scaler is fitted on the training rows
# only and then applied unchanged to the test rows, so the held-out set does
# not influence the scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled values back into DataFrames so the column names survive.
# The original row index is carried over too: y_train / y_test still use the
# index labels from the split, and without `index=` the scaled frames would
# get a fresh 0..n-1 index that no longer lines up with the targets in any
# index-aligned pandas operation (concat, join, boolean masks, ...).
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

print("Data normalization complete.")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")

Data normalization complete.
X_train_scaled shape: (3368, 57)
X_test_scaled shape: (842, 57)


In [28]:
# GaussianNB model training
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the scaled training features
# (fit() returns the estimator itself, so it can be chained).
gnb_model = GaussianNB().fit(X_train_scaled, y_train)

# Predict the held-out rows and print the per-class metrics
y_pred_gnb = gnb_model.predict(X_test_scaled)
gnb_report = classification_report(y_test, y_pred_gnb)
print("\nGaussian Naive Bayes Results:", gnb_report, sep="\n")


Gaussian Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.96      0.75      0.84       506
           1       0.71      0.95      0.82       336

    accuracy                           0.83       842
   macro avg       0.84      0.85      0.83       842
weighted avg       0.86      0.83      0.83       842



In [29]:
# Logistic Regression model training
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the scaled training features; the iteration
# budget is set to 1000 and the seed is fixed for repeatability.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out rows and print the per-class metrics
y_pred_lr = lr_model.predict(X_test_scaled)
lr_report = classification_report(y_test, y_pred_lr)
print("\nLogistic Regression Results:", lr_report, sep="\n")


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       506
           1       0.93      0.90      0.92       336

    accuracy                           0.93       842
   macro avg       0.93      0.93      0.93       842
weighted avg       0.93      0.93      0.93       842



In [30]:
# Random Forest model training
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest on the scaled training features, using all
# available cores and a fixed seed.
rf_settings = {"n_estimators": 100, "random_state": 42, "n_jobs": -1}
rf_model = RandomForestClassifier(**rf_settings).fit(X_train_scaled, y_train)

# Predict the held-out rows and print the per-class metrics
y_pred_rf = rf_model.predict(X_test_scaled)
rf_report = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Results:", rf_report, sep="\n")


Random Forest Results:
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       506
           1       0.94      0.93      0.93       336

    accuracy                           0.95       842
   macro avg       0.94      0.94      0.94       842
weighted avg       0.95      0.95      0.95       842



In [31]:
# Hyperparameter search spaces, one per classifier. Keys are plain estimator
# parameter names (no pipeline step prefix).

# GaussianNB: var_smoothing is the only parameter searched.
gnb_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7]}

# LogisticRegression: sweep the regularization parameter C while keeping the
# penalty and solver fixed.
lr_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
}

# RandomForest: number of trees, maximum tree depth, minimum samples per leaf.
rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 2, 4],
}

param_grids = {'gnb': gnb_grid, 'lr': lr_grid, 'rf': rf_grid}

In [22]:
from sklearn.model_selection import GridSearchCV

# Pipeline-flavoured variant of the search grids. Every key carries the
# `classifier__` prefix, which is how scikit-learn addresses a parameter of a
# Pipeline step named "classifier".
#
# This is deliberately stored under its own name: the grid search in this
# notebook runs on bare estimators and needs the un-prefixed `param_grids`.
# Rebinding `param_grids` here would make a top-to-bottom run fail with
# "Invalid parameter 'classifier'" inside GridSearchCV.
pipeline_param_grids = {
    'gnb': {
        # GaussianNB: var_smoothing is the only parameter searched
        'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]
    },
    'lr': {
        # LogisticRegression: vary the regularization parameter C; penalty
        # and solver are held fixed
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['lbfgs']
    },
    'rf': {
        # RandomForest: number of trees, max depth, and min samples per leaf
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_leaf': [1, 2, 4]
    }
}

In [32]:
# Untuned estimators to search over, keyed like `param_grids`. They are used
# directly (no Pipeline wrapper) and fed the already-scaled features.
models = dict(
    gnb=GaussianNB(),
    lr=LogisticRegression(max_iter=1000, random_state=42),
    rf=RandomForestClassifier(random_state=42, n_jobs=-1),
)

# Settings shared by every search: optimize F1, 5-fold cross-validation,
# all cores, per-fit logging.
# NOTE(review): X_train_scaled was standardized using all training rows
# before the folds are cut, so each validation fold contributed to the scaling
# statistics; CV scores may be slightly optimistic. Confirm whether that is
# acceptable here.
search_settings = dict(scoring='f1', cv=5, n_jobs=-1, verbose=2)

# Fitted search objects, keyed by model name
grid_searches = {}

for name, estimator in models.items():
    gs = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[name],
        **search_settings,
    )
    print(f"Running GridSearch for {name}...")
    grid_searches[name] = gs.fit(X_train_scaled, y_train)  # fit() returns gs
    print(f"Best params for {name}: {gs.best_params_}")
    print(f"Best CV F1-score: {gs.best_score_:.3f}\n")

# Score each tuned (refit) estimator on the held-out, scaled test rows
for name, gs in grid_searches.items():
    print(f"=== {name.upper()} ===")
    y_pred = gs.best_estimator_.predict(X_test_scaled)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

# Pick the overall winner by cross-validated F1 (not by test-set score);
# on a tie the first model in insertion order wins.
best_scores = {key: search.best_score_ for key, search in grid_searches.items()}
best_name, best_score = max(best_scores.items(), key=lambda item: item[1])
best_model = grid_searches[best_name].best_estimator_

print(f"Best model overall: {best_name.upper()} with CV F1 = {best_score:.4f}")

Running GridSearch for gnb...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ..........

In [33]:
# Export the selected model and the fitted scaler to .pkl files. Both are
# needed at inference time: the model was trained on scaled features, so new
# data has to go through the same scaler before prediction.
output_path = f"../models/{best_name}_best_model.pkl"
scaler_path = "../models/standard_scaler.pkl"

# joblib.dump does not create missing parent folders; make sure the target
# directory exists so a fresh checkout does not fail with FileNotFoundError.
for artifact_path in (output_path, scaler_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, output_path)
joblib.dump(scaler, scaler_path)
print(f"Exported {best_name} model to {output_path}")
print(f"Exported scaler to {scaler_path}")

# Print (not execute) a short usage recipe showing how the two saved
# artifacts are meant to be combined for prediction.
print("\nExample of using saved model and scaler for prediction:")
print("loaded_scaler = joblib.load(scaler_path)")
print("loaded_model = joblib.load(output_path)")
print("X_new_scaled = loaded_scaler.transform(X_new)")
print("predictions = loaded_model.predict(X_new_scaled)")

Exported rf model to ../models/rf_best_model.pkl
Exported scaler to ../models/standard_scaler.pkl

Example of using saved model and scaler for prediction:
loaded_scaler = joblib.load(scaler_path)
loaded_model = joblib.load(output_path)
X_new_scaled = loaded_scaler.transform(X_new)
predictions = loaded_model.predict(X_new_scaled)
