In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 1. Load the cleaned dataset
df = pd.read_csv("../data/spambase_cleaned.csv")

# 2. Define Features (X) and Target (y)
X = df.drop(columns="is_spam")
y = df["is_spam"]

# 3. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test set shape: X_test={X_test.shape}, y_test={y_test.shape}")


Training set shape: X_train=(3368, 57), y_train=(3368,)
Test set shape: X_test=(842, 57), y_test=(842,)


In [3]:
# Initialize StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data scaling complete.")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")


Data scaling complete.
X_train_scaled shape: (3368, 57)
X_test_scaled shape: (842, 57)


In [7]:
# Initialize a base Random Forest model instance
# random_state is set for reproducibility, n_jobs for parallelism
rf_model_for_tuning = RandomForestClassifier(random_state=42, n_jobs=-1)

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search_rf = GridSearchCV(
    estimator=rf_model_for_tuning, # Pass the model directly
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV to the SCALED training data
grid_search_rf.fit(X_train_scaled, y_train)

print("\nGridSearchCV complete.")
print(f"Best F1-score from CV: {grid_search_rf.best_score_:.4f}")
print(f"Best parameters found: {grid_search_rf.best_params_}")

# Get the best estimator
best_rf_model = grid_search_rf.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END max_de

In [9]:
# Evaluate the best model found by GridSearchCV on the SCALED test set
print("\nEvaluating the best tuned Random Forest model on the scaled test set...")
y_pred_best_rf = best_rf_model.predict(X_test_scaled) # Use X_test_scaled

print("\nClassification Report (Best Tuned Random Forest):")
print(classification_report(y_test, y_pred_best_rf, digits=4))

print("\nConfusion Matrix (Best Tuned Random Forest):")
print(confusion_matrix(y_test, y_pred_best_rf))


Evaluating the best tuned Random Forest model on the scaled test set...

Classification Report (Best Tuned Random Forest):
              precision    recall  f1-score   support

           0     0.9473    0.9585    0.9528       506
           1     0.9364    0.9196    0.9279       336

    accuracy                         0.9430       842
   macro avg     0.9418    0.9391    0.9404       842
weighted avg     0.9429    0.9430    0.9429       842


Confusion Matrix (Best Tuned Random Forest):
[[485  21]
 [ 27 309]]


In [10]:
# Export the best Random Forest model
model_output_path = "../models/rf_best_model.pkl"
joblib.dump(best_rf_model, model_output_path)
print(f"\nBest tuned model exported to: {model_output_path}")


Best tuned model exported to: ../models/rf_best_model.pkl
