In [12]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Step 1: read the cleaned dataset from disk.
df = pd.read_csv("../data/spambase_cleaned.csv")

# Step 2: separate the label column from the feature columns.
y = df["is_spam"]
X = df.drop(columns=["is_spam"])

# Step 3: hold out 20% of the rows for testing. `stratify=y` keeps the
# class proportions the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the resulting partition sizes (one line per partition).
print(
    f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}",
    f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}",
    sep="\n",
)


Training set shape: X_train=(3368, 57), y_train=(3368,)
Test set shape: X_test=(842, 57), y_test=(842,)


In [None]:
# Learn the per-feature scaling statistics from the training split only,
# then apply that same fitted transform to both splits so the test data
# never influences the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm that scaling left the array shapes unchanged.
print(
    "Data normalization complete.",
    f"X_train_scaled shape: {X_train_scaled.shape}",
    f"X_test_scaled shape: {X_test_scaled.shape}",
    sep="\n",
)

Data normalization complete.
X_train_scaled shape: (3368, 57)
X_test_scaled shape: (842, 57)


In [18]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space per classifier. The outer keys are the short
# model names; each inner mapping goes from a constructor argument of that
# classifier to the list of candidate values to try.
param_grids = dict(
    # Gaussian naive Bayes: only the variance-smoothing term is searched.
    gnb=dict(
        var_smoothing=[1e-9, 1e-8, 1e-7],
    ),
    # Logistic regression: regularisation strength C is searched; penalty
    # and solver are pinned to a single value each ('l2' with 'lbfgs').
    # Searching 'l1' as well would require a solver such as 'liblinear',
    # which supports both 'l1' and 'l2'.
    lr=dict(
        C=[0.01, 0.1, 1, 10],
        penalty=['l2'],
        solver=['lbfgs'],
    ),
    # Random forest: 3 * 4 * 3 * 3 = 108 parameter combinations.
    rf=dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_leaf=[1, 2, 4],
        min_samples_split=[2, 5, 10],
    ),
)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Candidate classifiers, keyed by the same short names used in `param_grids`.
models = dict(
    gnb=GaussianNB(),
    lr=LogisticRegression(max_iter=1000, random_state=42),
    rf=RandomForestClassifier(random_state=42, n_jobs=-1),
)

# Settings shared by every search: optimise F1 with 5-fold cross-validation,
# use all available cores, and log each individual fit.
search_settings = dict(scoring='f1', cv=5, n_jobs=-1, verbose=2)

# Fitted GridSearchCV objects, keyed by model name.
grid_searches = {}

# NOTE(review): X_train_scaled was produced by a scaler fitted on the whole
# training split, so inside each CV round the validation fold has already
# contributed to the scaling statistics. Wrapping scaler + estimator in a
# Pipeline would avoid that; left unchanged here because the export cell
# saves the scaler and the model as separate artifacts.
for model_key, estimator in models.items():
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[model_key],
        **search_settings,
    )
    print(f"Running GridSearch for {model_key}...")
    search.fit(X_train_scaled, y_train)  # features are already scaled
    grid_searches[model_key] = search
    print(f"Best params for {model_key}: {search.best_params_}")
    print(f"Best CV F1-score: {search.best_score_:.3f}\n")

# Score each tuned model on the held-out (scaled) test split.
for model_key, search in grid_searches.items():
    print(f"=== {model_key.upper()} ===")
    y_pred = search.best_estimator_.predict(X_test_scaled)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")

# Select the overall winner by cross-validated F1 (the test-set numbers
# printed above play no part in the choice). Ties go to the model that
# was searched first.
best_scores = {key: fitted.best_score_ for key, fitted in grid_searches.items()}
best_name, best_score = max(best_scores.items(), key=lambda item: item[1])
best_model = grid_searches[best_name].best_estimator_

print(f"Best model overall: {best_name.upper()} with CV F1 = {best_score:.4f}")

Running GridSearch for gnb...
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-07; total time=   0.0s
[CV] END ................................var_smoothing=1e-09; total time=   0.0s
[CV] END ................................var_smoothing=1e-08; total time=   0.0s
[CV] END ...................

In [21]:
# Export the winning model and the fitted scaler to pkl files.
# The model was trained on scaled features, so anything that loads it must
# apply the saved scaler to its inputs before calling predict.
output_path = f"../models/{best_name}_best_model.pkl"
scaler_path = "../models/standard_scaler.pkl"

# joblib.dump opens the target file directly and does not create missing
# directories, so make sure the destination folders exist first (e.g. on a
# fresh checkout where ../models has not been created yet).
for artifact_path in (output_path, scaler_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, output_path)
joblib.dump(scaler, scaler_path)
print(f"Exported {best_name} model to {output_path}")
print(f"Exported scaler to {scaler_path}")

Exported rf model to ../models/rf_best_model.pkl
Exported scaler to ../models/standard_scaler.pkl
