In [1]:
# Import libraries
from notebooks.training_utils import load_training_data, save_models, log_to_mlflow, print_summary
from sklearn.naive_bayes import GaussianNB
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
import time
import mlflow
import mlflow.sklearn

# Configure MLflow: runs go to a local file store and are grouped under one
# experiment name.
# NOTE: "file:./mlruns" is a relative URI, so the store is resolved against the
# kernel's working directory rather than the project root.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("network-intrusion-detection")

# Confirmation banner; the emoji is written as real Unicode (it was previously
# stored as UTF-8 bytes mis-decoded as cp1252).
print("✅ Libraries imported")
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

✅ Libraries imported
MLflow Tracking URI: file:./mlruns


  return FileStore(store_uri, store_uri)


## 1. Load Data

In [2]:
# SMOTE-resampled training split, the test split, and the project root.
X_train_smote, X_test, y_train_smote, y_test, project_root = load_training_data(use_smote=True)

# Original (non-resampled) training split for the sample-weight strategy.
# The test split and project root returned by this second call are discarded;
# the ones bound above are used for every evaluation below.
X_train, _, y_train, _, _ = load_training_data(use_smote=False)

# Emoji written as real Unicode (previously mis-decoded UTF-8 bytes).
print("\n💡 Note: GaussianNB doesn't support class_weight parameter.")
print("   We'll use sample_weight in the fit method instead.")

Loading SMOTE training data...
  Training set: (446182, 334)
  Test set: (57960, 334)
  Train class distribution: Benign=223091, Attack=223091
Loading original training data...
  Training set: (231839, 334)
  Test set: (57960, 334)
  Train class distribution: Benign=223091, Attack=8748

💡 Note: GaussianNB doesn't support class_weight parameter.
   We'll use sample_weight in the fit method instead.


## 2. Train Naive Bayes Models

In [3]:
def train_and_evaluate_nb(strategy_label, X_fit, y_fit, X_eval, y_eval, sample_weight=None):
    """Fit a GaussianNB model, score it on an evaluation split, and print a report.

    Parameters
    ----------
    strategy_label : str
        Text shown after "TRAINING: Naive Bayes - " in the banner.
    X_fit, y_fit
        Training features and labels passed to ``GaussianNB.fit``.
    X_eval, y_eval
        Features and labels the fitted model is scored against.
    sample_weight : array-like or None, default None
        Per-sample weights forwarded to ``fit``; ``None`` means unweighted.

    Returns
    -------
    (model, metrics) : tuple
        The fitted estimator and a dict with keys ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``roc_auc``, ``pr_auc`` and ``train_time`` (seconds).
    """
    print("=" * 80)
    print(f"TRAINING: Naive Bayes - {strategy_label}")
    print("=" * 80)

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    model = GaussianNB()
    model.fit(X_fit, y_fit, sample_weight=sample_weight)
    train_time = time.perf_counter() - started

    print(f"✅ Training completed in {train_time:.2f} seconds")

    predicted_labels = model.predict(X_eval)
    # Column 1 holds the probability of model.classes_[1]
    # (assumed to be the attack class - TODO confirm label encoding).
    positive_scores = model.predict_proba(X_eval)[:, 1]

    metrics = {
        'accuracy': accuracy_score(y_eval, predicted_labels),
        'precision': precision_score(y_eval, predicted_labels, zero_division=0),
        'recall': recall_score(y_eval, predicted_labels, zero_division=0),
        'f1': f1_score(y_eval, predicted_labels, zero_division=0),
        'roc_auc': roc_auc_score(y_eval, positive_scores),
        'pr_auc': average_precision_score(y_eval, positive_scores),
        'train_time': train_time
    }

    print("\nTest Set Metrics:")
    for name, value in metrics.items():
        if name == 'train_time':
            print(f"  {name}: {value:.2f}s")
        else:
            print(f"  {name}: {value:.4f}")

    return model, metrics


# SMOTE strategy: fit on the resampled training split.
nb_smote, metrics_smote = train_and_evaluate_nb(
    "SMOTE Strategy", X_train_smote, y_train_smote, X_test, y_test
)
train_time_smote = metrics_smote['train_time']

# Blank line separating the two reports.
print()

# Sample-weight strategy: fit on the original split, re-weighting each sample
# with 'balanced' weights computed from the training labels.
sample_weights = compute_sample_weight('balanced', y_train)
nb_weighted, metrics_weighted = train_and_evaluate_nb(
    "Sample Weight Strategy", X_train, y_train, X_test, y_test,
    sample_weight=sample_weights,
)
train_time_weighted = metrics_weighted['train_time']

# NOTE(review): the recorded run shows identical test metrics (to 4 decimals)
# for both strategies and near-perfect scores - worth checking for feature
# leakage before trusting these numbers.

TRAINING: Naive Bayes - SMOTE Strategy
✅ Training completed in 0.79 seconds

Test Set Metrics:
  accuracy: 1.0000
  precision: 0.9995
  recall: 0.9995
  f1: 0.9995
  roc_auc: 0.9998
  pr_auc: 0.9991
  train_time: 0.79s

TRAINING: Naive Bayes - Sample Weight Strategy
✅ Training completed in 0.91 seconds

Test Set Metrics:
  accuracy: 1.0000
  precision: 0.9995
  recall: 0.9995
  f1: 0.9995
  roc_auc: 0.9998
  pr_auc: 0.9991
  train_time: 0.91s


## 3. Save Models

In [4]:
# Persist both fitted models and their metric dicts under the 'nb' prefix.
# Per the recorded output this writes nb_smote.pkl, nb_weighted.pkl and
# metrics/nb_metrics.pkl beneath <project_root>/models.
save_models(
    nb_smote,
    nb_weighted,
    metrics_smote,
    metrics_weighted,
    'nb',
    project_root,
)

✅ Saved: /Users/matthewweaver/Repositories/nidstream/models/nb_smote.pkl
✅ Saved: /Users/matthewweaver/Repositories/nidstream/models/nb_weighted.pkl
✅ Saved metrics: /Users/matthewweaver/Repositories/nidstream/models/metrics/nb_metrics.pkl


## 4. Log to MLflow

In [5]:
# Log the SMOTE-trained model, with the data it was fitted on.
log_to_mlflow(
    nb_smote, metrics_smote, "NB_SMOTE", "NaiveBayes", "SMOTE",
    {"algorithm": "GaussianNB"},
    X_train_smote, X_test, y_train_smote,
    mlflow.sklearn
)

# Log the sample-weighted model, with the original (non-resampled) data.
log_to_mlflow(
    nb_weighted, metrics_weighted, "NB_Weighted", "NaiveBayes", "Sample_Weight",
    {"algorithm": "GaussianNB", "sample_weight": "balanced"},
    X_train, X_test, y_train,
    mlflow.sklearn
)

# Emoji written as real Unicode (previously mis-decoded UTF-8 bytes).
print("\n✅ All models logged to MLflow")

Logging NB_SMOTE to MLflow...




  ✅ Run ID: 03105928da7a4f74ae4eb3b561057c12
Logging NB_Weighted to MLflow...




  ✅ Run ID: 3299e64c89a547baa7223d316824cbe8

✅ All models logged to MLflow


## 5. Summary

In [6]:
# Side-by-side summary of the two strategies.
# NOTE(review): in the recorded output both strategies report identical
# PR-AUC / F1 / recall, yet a "better strategy" is still announced, and the
# second strategy is labelled "Class Weight" although it is trained with
# sample weights here - confirm the tie-break and labelling in print_summary.
print_summary(metrics_smote, metrics_weighted, "Naive Bayes")

# Emoji written as real Unicode (previously mis-decoded UTF-8 bytes).
print("\n💡 Note: Naive Bayes is fast but assumes feature independence.")
print("   It may underperform compared to tree-based models for this problem.")
print("   Consider using RF, XGB, or LGB for better performance.")


NAIVE BAYES TRAINING COMPLETE

SMOTE Strategy:
  PR-AUC: 0.9991
  F1 Score: 0.9995
  Recall: 0.9995

Class Weight Strategy:
  PR-AUC: 0.9991
  F1 Score: 0.9995
  Recall: 0.9995

✅ Better strategy for Naive Bayes: Class Weight

💡 Note: Naive Bayes is fast but assumes feature independence.
   It may underperform compared to tree-based models for this problem.
   Consider using RF, XGB, or LGB for better performance.
