In [10]:
# Import libraries
import os
from notebooks.training_utils import (
    load_training_data, train_and_evaluate, save_models, 
    log_to_mlflow, print_summary
)
import numpy as np


# Fix matplotlib backend issue with LightGBM in notebooks
# NOTE(review): this is set after the imports above; if any of them already
# pulled in matplotlib, the variable may be read too late to matter — confirm.
os.environ['MPLBACKEND'] = 'agg'

# Optional dependencies. LIGHTGBM_AVAILABLE gates every later cell, and those
# cells use both `lgb` and `mlflow`, so all three imports must succeed.
try:
    import lightgbm as lgb
    import mlflow
    import mlflow.lightgbm
    LIGHTGBM_AVAILABLE = True
    print("✅ LightGBM available")
except ImportError as e:
    LIGHTGBM_AVAILABLE = False
    # Any of the three imports can fail, so name the package that is really
    # missing. ImportError.name may be None or a dotted submodule path; fall
    # back to lightgbm and keep only the top-level distribution name.
    missing_pkg = (getattr(e, "name", None) or "lightgbm").split(".")[0]
    label = "LightGBM" if missing_pkg == "lightgbm" else missing_pkg
    print(f"⚠️  {label} not installed: {e}")
    print(f"    Install with: pip install {missing_pkg}")
    print("    Skipping this notebook...")

# Setup MLflow (local file-based tracking store under ./mlruns)
if LIGHTGBM_AVAILABLE:
    mlflow.set_tracking_uri("file:./mlruns")
    mlflow.set_experiment("network-intrusion-detection")
    print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

✅ LightGBM available
MLflow Tracking URI: file:./mlruns


## 1. Load Data

In [11]:
if LIGHTGBM_AVAILABLE:
    # SMOTE variant of the training split, plus the test split and project
    # root that the later cells reuse.
    smote_split = load_training_data(use_smote=True)
    X_train_smote, X_test, y_train_smote, y_test, project_root = smote_split

    # Non-SMOTE training split for the scale_pos_weight strategy; the other
    # three returned values are discarded here.
    X_train, _, y_train, _, _ = load_training_data(use_smote=False)

    # scale_pos_weight = (# samples labelled 0) / (# samples labelled 1),
    # computed on the non-SMOTE training labels.
    n_label0 = np.sum(y_train == 0)
    n_label1 = np.sum(y_train == 1)
    scale_pos_weight = n_label0 / n_label1
    print(f"Scale pos weight: {scale_pos_weight:.2f}")

Loading SMOTE training data...
  Training set: (446182, 334)
  Test set: (57960, 334)
  Train class distribution: Benign=223091, Attack=223091
Loading original training data...
  Training set: (231839, 334)
  Test set: (57960, 334)
  Train class distribution: Benign=223091, Attack=8748
Scale pos weight: 25.50


## 2. Train LightGBM Models

In [12]:
if LIGHTGBM_AVAILABLE:
    # Hyperparameters shared by both imbalance-handling strategies.
    lgb_common_params = dict(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
    )

    # Strategy 1 — train on the SMOTE training data.
    lgb_smote, metrics_smote = train_and_evaluate(
        lgb.LGBMClassifier(**lgb_common_params),
        X_train_smote, y_train_smote, X_test, y_test,
        "LightGBM - SMOTE Strategy"
    )

    # Strategy 2 — train on the non-SMOTE data and pass the computed
    # scale_pos_weight to the classifier instead.
    lgb_weighted, metrics_weighted = train_and_evaluate(
        lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight, **lgb_common_params),
        X_train, y_train, X_test, y_test,
        "LightGBM - Scale Pos Weight Strategy"
    )

TRAINING: LightGBM - SMOTE Strategy
✅ Training completed in 6.33 seconds

Test Set Metrics:
  accuracy: 1.0000
  precision: 0.9995
  recall: 1.0000
  f1: 0.9998
  roc_auc: 1.0000
  pr_auc: 1.0000
  train_time: 6.33s
TRAINING: LightGBM - Scale Pos Weight Strategy
✅ Training completed in 4.76 seconds

Test Set Metrics:
  accuracy: 1.0000
  precision: 0.9995
  recall: 1.0000
  f1: 0.9998
  roc_auc: 1.0000
  pr_auc: 1.0000
  train_time: 4.76s


## 3. Save Models

In [13]:
if LIGHTGBM_AVAILABLE:
    # Save both trained models and their metrics via the shared helper;
    # 'lgb' appears as the file-name prefix in the recorded output.
    save_models(
        lgb_smote,
        lgb_weighted,
        metrics_smote,
        metrics_weighted,
        'lgb',
        project_root,
    )

✅ Saved: /Users/matthewweaver/Repositories/nidstream/models/lgb_smote.pkl
✅ Saved: /Users/matthewweaver/Repositories/nidstream/models/lgb_weighted.pkl
✅ Saved metrics: /Users/matthewweaver/Repositories/nidstream/models/lgb_metrics.pkl


## 4. Log to MLflow

In [14]:
if LIGHTGBM_AVAILABLE:
    # Read the logged hyperparameters back from the estimators instead of
    # retyping them, so the MLflow record cannot drift from what was trained.
    # NOTE(review): assumes train_and_evaluate returned the LGBMClassifier
    # instances themselves (they are logged with the mlflow.lightgbm flavor).
    logged_param_names = ("n_estimators", "max_depth", "learning_rate")

    # Log SMOTE model
    smote_params = lgb_smote.get_params()
    log_to_mlflow(
        lgb_smote, metrics_smote, "LGB_SMOTE", "LightGBM", "SMOTE",
        {name: smote_params[name] for name in logged_param_names},
        X_train_smote, X_test, y_train_smote,
        mlflow.lightgbm
    )

    # Log Weighted model
    weighted_params = lgb_weighted.get_params()
    weighted_logged = {name: weighted_params[name] for name in logged_param_names}
    # Cast to a plain float, as the original literal dict did.
    weighted_logged["scale_pos_weight"] = float(weighted_params["scale_pos_weight"])
    log_to_mlflow(
        lgb_weighted, metrics_weighted, "LGB_ScalePosWeight", "LightGBM", "Scale_Pos_Weight",
        weighted_logged,
        X_train, X_test, y_train,
        mlflow.lightgbm
    )

    print("\n✅ All models logged to MLflow")

Logging LGB_SMOTE to MLflow...




  ✅ Run ID: b3777fdcba8447b2a8a382c44a5f2c06
Logging LGB_ScalePosWeight to MLflow...




  ✅ Run ID: 6c988f2658e14938bb858e2a9ba76006

✅ All models logged to MLflow


## 5. Summary

In [15]:
if LIGHTGBM_AVAILABLE:
    # Print the side-by-side comparison of the two strategies.
    # NOTE(review): in the recorded output both strategies report identical,
    # near-perfect metrics yet SMOTE is named the better one — confirm the
    # helper's tie-breaking and check the data for leakage.
    print_summary(
        metrics_smote,
        metrics_weighted,
        "LightGBM",
    )


LIGHTGBM TRAINING COMPLETE

SMOTE Strategy:
  PR-AUC: 1.0000
  F1 Score: 0.9998
  Recall: 1.0000

Class Weight Strategy:
  PR-AUC: 1.0000
  F1 Score: 0.9998
  Recall: 1.0000

✅ Better strategy for LightGBM: SMOTE
