# Model Training for Network Intrusion Detection

This notebook trains machine learning models on the processed BCCC-CSE-CIC-IDS2018 dataset.

## Objectives:
1. Load processed features from the feature engineering pipeline
2. Train baseline models (Logistic Regression, Random Forest, XGBoost)
3. Handle class imbalance with SMOTE
4. Evaluate model performance
5. Compare models using multiple metrics
6. Save best model for deployment

In [1]:
# Import libraries
import os
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import time

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from imblearn.over_sampling import SMOTE

import xgboost as xgb

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# default figure size of (12, 6).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

%matplotlib inline

## 1. Load Processed Data

In [2]:
# Read the processed train/test feature tables and label vectors from
# <cwd>/data/processed, then report their shapes and the training class mix.
project_root = Path().resolve()
processed_dir = project_root / 'data' / 'processed'

print("Loading processed data...")
X_train = pd.read_csv(processed_dir / 'X_train.csv')
X_test = pd.read_csv(processed_dir / 'X_test.csv')
# The label files carry a single 'label' column; keep it as a NumPy array.
y_train = pd.read_csv(processed_dir / 'y_train.csv')['label'].values
y_test = pd.read_csv(processed_dir / 'y_test.csv')['label'].values

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print("\nClass distribution (train):")
for class_name, class_id in (('Benign', 0), ('Attack', 1)):
    class_count = (y_train == class_id).sum()
    class_share = class_count / len(y_train) * 100
    print(f"  {class_name} ({class_id}): {class_count} ({class_share:.1f}%)")

Loading processed data...
Training set: (190960, 317)
Test set: (47740, 317)

Class distribution (train):
  Benign (0): 182211 (95.4%)
  Attack (1): 8749 (4.6%)


In [3]:
# Reload the fitted scaler and label encoder for reference and print what
# kind of objects they are.
# NOTE(review): joblib.load unpickles its input — only load artifacts that
# come from a trusted source.
scaler = joblib.load(processed_dir / 'scaler.pkl')
label_encoder = joblib.load(processed_dir / 'label_encoder.pkl')

print("Loaded preprocessing artifacts:")
for artifact_title, artifact in (('Scaler', scaler), ('Label Encoder', label_encoder)):
    print(f"  - {artifact_title}: {type(artifact).__name__}")
print(f"  - Label classes: {label_encoder.classes_}")

Loaded preprocessing artifacts:
  - Scaler: StandardScaler
  - Label Encoder: LabelEncoder
  - Label classes: ['Benign' 'Bot' nan]


## 2. Handle Class Imbalance with SMOTE

In [4]:
# Oversample the training split with SMOTE and print the per-class counts
# before and after. Only X_train / y_train are resampled here.
print("Applying SMOTE to balance classes...")
print(f"Before SMOTE: {X_train.shape}")
for class_name, class_id in (('Benign', 0), ('Attack', 1)):
    print(f"  {class_name}: {(y_train == class_id).sum()}")

smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"\nAfter SMOTE: {X_train_balanced.shape}")
for class_name, class_id in (('Benign', 0), ('Attack', 1)):
    print(f"  {class_name}: {(y_train_balanced == class_id).sum()}")
print("\nClasses are now balanced!")

Applying SMOTE to balance classes...
Before SMOTE: (190960, 317)
  Benign: 182211
  Attack: 8749

After SMOTE: (364422, 317)
  Benign: 182211
  Attack: 182211

Classes are now balanced!


## 3. Train Baseline Models

In [5]:
# Two registries keyed by model display name:
#   models  -> fitted estimator
#   results -> dict of evaluation metrics for that estimator
models, results = {}, {}

### 3.1 Logistic Regression

In [6]:
# Fit a logistic-regression baseline on the SMOTE-balanced training data,
# time the fit, and score it on the test split.
print("Training Logistic Regression...")
start_time = time.time()

lr_model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1)
lr_model.fit(X_train_balanced, y_train_balanced)

train_time = time.time() - start_time
print(f"Training completed in {train_time:.2f} seconds")

# Hard labels feed the label-based metrics; column 1 of predict_proba
# (class 1, "Attack") feeds ROC-AUC.
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# Register the fitted estimator under its display name.
models['Logistic Regression'] = lr_model

# Key order is preserved: label-based scores, then ROC-AUC, then fit time.
results['Logistic Regression'] = {
    **{
        score_name: score_fn(y_test, y_pred_lr)
        for score_name, score_fn in (
            ('accuracy', accuracy_score),
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'roc_auc': roc_auc_score(y_test, y_pred_proba_lr),
    'train_time': train_time,
}

print("\nLogistic Regression Results:")
for metric, value in results['Logistic Regression'].items():
    # Fit time is shown in seconds with 2 decimals; scores use 4 decimals.
    rendered = f"{value:.2f}s" if metric == 'train_time' else f"{value:.4f}"
    print(f"  {metric}: {rendered}")

Training Logistic Regression...
Training completed in 23.29 seconds

Logistic Regression Results:
  accuracy: 0.9999
  precision: 0.9973
  recall: 1.0000
  f1: 0.9986
  roc_auc: 1.0000
  train_time: 23.29s


### 3.2 Random Forest

In [7]:
# Fit a random-forest baseline on the SMOTE-balanced training data,
# time the fit, and score it on the test split.
print("Training Random Forest...")
start_time = time.time()

rf_params = dict(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_balanced, y_train_balanced)

train_time = time.time() - start_time
print(f"Training completed in {train_time:.2f} seconds")

# Hard labels feed the label-based metrics; column 1 of predict_proba
# (class 1, "Attack") feeds ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Register the fitted estimator under its display name.
models['Random Forest'] = rf_model

# Key order is preserved: label-based scores, then ROC-AUC, then fit time.
results['Random Forest'] = {
    **{
        score_name: score_fn(y_test, y_pred_rf)
        for score_name, score_fn in (
            ('accuracy', accuracy_score),
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'roc_auc': roc_auc_score(y_test, y_pred_proba_rf),
    'train_time': train_time,
}

print("\nRandom Forest Results:")
for metric, value in results['Random Forest'].items():
    # Fit time is shown in seconds with 2 decimals; scores use 4 decimals.
    rendered = f"{value:.2f}s" if metric == 'train_time' else f"{value:.4f}"
    print(f"  {metric}: {rendered}")

Training Random Forest...
Training completed in 15.79 seconds

Random Forest Results:
  accuracy: 0.9925
  precision: 0.8590
  recall: 1.0000
  f1: 0.9241
  roc_auc: 1.0000
  train_time: 15.79s


### 3.3 XGBoost

In [None]:
# Fit an XGBoost baseline on the SMOTE-balanced training data,
# time the fit, and score it on the test split.
print("Training XGBoost...")
start_time = time.time()

xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_balanced, y_train_balanced)

train_time = time.time() - start_time
print(f"Training completed in {train_time:.2f} seconds")

# Hard labels feed the label-based metrics; column 1 of predict_proba
# (class 1, "Attack") feeds ROC-AUC.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Register the fitted estimator under its display name.
models['XGBoost'] = xgb_model

# Key order is preserved: label-based scores, then ROC-AUC, then fit time.
results['XGBoost'] = {
    **{
        score_name: score_fn(y_test, y_pred_xgb)
        for score_name, score_fn in (
            ('accuracy', accuracy_score),
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
    'roc_auc': roc_auc_score(y_test, y_pred_proba_xgb),
    'train_time': train_time,
}

print("\nXGBoost Results:")
for metric, value in results['XGBoost'].items():
    # Fit time is shown in seconds with 2 decimals; scores use 4 decimals.
    rendered = f"{value:.2f}s" if metric == 'train_time' else f"{value:.4f}"
    print(f"  {metric}: {rendered}")

Training XGBoost...


## 4. Model Comparison

In [None]:
# Tabulate the collected metrics with one row per model and one column per
# metric, framed by 80-character rules.
results_df = pd.DataFrame(results).transpose()
banner = "=" * 80
print(
    "\n" + banner,
    "MODEL COMPARISON",
    banner,
    results_df.to_string(),
    banner,
    sep="\n",
)

In [None]:
# Draw one bar chart per label-based metric on a 2x2 grid, with one bar per
# trained model.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics = ['accuracy', 'precision', 'recall', 'f1']
palette = ['steelblue', 'orange', 'green'][:len(results)]
# axes.flat walks the grid row by row, matching the order of `metrics`.
for ax, metric in zip(axes.flat, metrics):
    metric_title = metric.capitalize()
    values = [scores[metric] for scores in results.values()]
    bars = ax.bar(results.keys(), values, color=palette)

    ax.set_ylabel(metric_title)
    ax.set_title(f'{metric_title} Comparison')
    ax.set_ylim([0, 1.05])
    ax.tick_params(axis='x', rotation=45)

    # Print each bar's value, horizontally centered, just above its top edge.
    for bar in bars:
        height = bar.get_height()
        label_x = bar.get_x() + bar.get_width() / 2.
        ax.text(label_x, height, f'{height:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 5. Detailed Evaluation - Best Model

In [None]:
# Choose the model with the highest F1 in `results` and recompute its
# test-set predictions for the detailed evaluation cells.
# NOTE(review): the F1 values in `results` were computed on the test split,
# so the metrics reported for the winner are likely optimistic — consider
# selecting on a validation split instead.
best_model_name = max(results, key=lambda name: results[name]['f1'])
best_model = models[best_model_name]
best_f1 = results[best_model_name]['f1']

print(f"Best Model: {best_model_name}")
print(f"F1 Score: {best_f1:.4f}")

# Hard labels and class-1 probabilities from the selected model.
y_pred_best = best_model.predict(X_test)
y_pred_proba_best = best_model.predict_proba(X_test)[:, 1]

In [None]:
# Confusion matrix for the best model on the test split.
# labels=[0, 1] pins the matrix to 2x2 with rows/columns ordered
# Benign (0), Attack (1). Without it the shape and order are inferred from
# the values present in y_test / y_pred_best, so a missing class would
# break the cm[i, j] lookups and misalign the hard-coded tick labels.
class_names = ['Benign', 'Attack']
cm = confusion_matrix(y_test, y_pred_best, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title(f'Confusion Matrix - {best_model_name}')
plt.show()

# Rows are true labels, columns are predicted labels, so the row-major
# flattening of the 2x2 matrix is TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix Values:")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")

In [None]:
# Print the classification report for the best model on the test split,
# with classes displayed as Benign / Attack.
print("\nClassification Report:")
report_text = classification_report(
    y_test,
    y_pred_best,
    target_names=['Benign', 'Attack'],
)
print(report_text)

In [None]:
# ROC curve of the best model on the test split, drawn against the
# diagonal labelled 'Random Classifier'.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_best)
auc_score = roc_auc_score(y_test, y_pred_proba_best)

roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, linewidth=2, label=f'{best_model_name} (AUC = {auc_score:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_ax.grid(alpha=0.3)
plt.show()

## 6. Feature Importance (for tree-based models)

In [None]:
# Rank and plot feature importances when the best model exposes
# `feature_importances_`; otherwise print that it does not.
if not hasattr(best_model, 'feature_importances_'):
    print(f"{best_model_name} does not support feature importances.")
else:
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Horizontal bars for the 20 highest-ranked features.
    top_features = feature_importance.head(20)
    bar_positions = range(len(top_features))

    importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
    importance_ax.barh(bar_positions, top_features['importance'])
    importance_ax.set_yticks(bar_positions)
    importance_ax.set_yticklabels(top_features['feature'])
    importance_ax.set_xlabel('Feature Importance')
    importance_ax.set_title(f'Top 20 Feature Importances - {best_model_name}')
    # Flip the y-axis so the highest-ranked feature is at the top.
    importance_ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

## 7. Save Best Model

In [None]:
# Persist the selected model and a metadata record under <project_root>/models.
models_dir = project_root / 'models'
models_dir.mkdir(parents=True, exist_ok=True)

# File name is derived from the display name: lower-cased, spaces -> underscores.
model_slug = best_model_name.lower().replace(" ", "_")
model_path = models_dir / f'best_model_{model_slug}.pkl'
joblib.dump(best_model, model_path)

print(f"\nBest model saved to: {model_path}")

# Metadata: model identity, its metrics, the training-set size after
# resampling, and the training feature columns.
feature_names = list(X_train.columns)
metadata = dict(
    model_name=best_model_name,
    model_type=type(best_model).__name__,
    metrics=results[best_model_name],
    training_samples=len(X_train_balanced),
    features=feature_names,
    n_features=len(feature_names),
)

metadata_path = models_dir / 'model_metadata.pkl'
joblib.dump(metadata, metadata_path)

print(f"Model metadata saved to: {metadata_path}")

## 8. Summary

In [None]:
# Closing recap: dataset sizes, the models that were trained, the selected
# model's metrics, and where the model file was written.
banner = "=" * 80
print(banner)
print("TRAINING SUMMARY")
print(banner)

print("\nDataset:")
print(f"  Training samples (after SMOTE): {len(X_train_balanced):,}")
print(f"  Test samples: {len(X_test):,}")
print(f"  Number of features: {X_train.shape[1]}")

print(f"\nModels Trained: {len(models)}")
for model_name in models:
    print(f"  - {model_name}")

print(f"\nBest Model: {best_model_name}")
best_scores = results[best_model_name]
for metric_label, metric_key in (
    ('Accuracy', 'accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1'),
    ('ROC-AUC', 'roc_auc'),
):
    print(f"  {metric_label}: {best_scores[metric_key]:.4f}")

print(f"\nModel saved to: {model_path}")
print(banner)

## Next Steps

1. **Hyperparameter Tuning**: Use GridSearchCV or Optuna to find optimal hyperparameters
2. **Feature Selection**: Try reducing features to improve model performance and speed
3. **Ensemble Methods**: Combine multiple models for better predictions
4. **Cross-Validation**: Perform k-fold cross-validation for more robust evaluation, and select the best model on validation folds rather than on the test set
5. **Deployment**: Use the saved model in the inference pipeline (`src/inference_pipeline/`)
6. **API Integration**: Integrate with FastAPI service (`src/api/main.py`)
7. **Dashboard**: Visualize predictions in Streamlit dashboard (`app.py`)