# Random Forest Baseline for BERT-IDS

This notebook implements a Random Forest baseline model for network intrusion detection using the CICIDS2017 dataset.

## Objectives:
1. Load and preprocess CICIDS2017 dataset
2. Implement comprehensive data preprocessing pipeline
3. Train Random Forest classifier
4. Evaluate model performance with detailed metrics
5. Analyze feature importance
6. Establish baseline performance for BERT-IDS comparison

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import json
import time
from datetime import datetime

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score
)
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif

# Additional utilities
import joblib
from collections import Counter
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Notebook-wide settings: fixed NumPy seed for repeatable runs, plotting
# theme/palette, and globally silenced warnings.
# NOTE: the blanket warnings filter also hides deprecation notices.
np.random.seed(42)
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# Startup banner with the wall-clock time the notebook began executing
print("📚 Libraries imported successfully!")
print(f"🕐 Notebook started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Data Loading and Initial Setup

In [None]:
# Project-relative locations: raw input data plus three output folders
DATA_DIR = Path('../data/raw/cicids2017')
PROCESSED_DIR = Path('../data/processed')
MODELS_DIR = Path('../models/checkpoints')
RESULTS_DIR = Path('../results')

# Only the output folders are created; DATA_DIR is read-only input and is
# allowed to be missing (the loader then falls back to synthetic data).
for output_dir in (PROCESSED_DIR, MODELS_DIR, RESULTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

print(f"📁 Data directory: {DATA_DIR}")
print(f"📁 Processed directory: {PROCESSED_DIR}")
print(f"📁 Models directory: {MODELS_DIR}")
print(f"📁 Results directory: {RESULTS_DIR}")

In [None]:
# Enhanced data loading function
def load_and_preprocess_data(data_dir, sample_size=None, test_mode=True):
    """
    Load every ``*.csv`` file in ``data_dir`` into a single DataFrame.

    Despite the name, this function only loads the files, normalises the
    column names (surrounding whitespace stripped, inner spaces replaced by
    underscores) and optionally subsamples. Feature preprocessing is done
    elsewhere.

    Args:
        data_dir: Directory to search for CSV files (``Path`` or ``str``).
        sample_size: Maximum number of rows to return (None for all rows).
        test_mode: Currently unused; kept so existing callers keep working.

    Returns:
        The combined (and possibly subsampled) DataFrame. If no CSV file is
        found, or none can be read, the result of
        ``create_synthetic_data(sample_size or 10000)`` is returned instead.
    """
    # Sorted so the concatenation order -- and therefore the seeded sample
    # below -- does not depend on filesystem enumeration order.
    csv_files = sorted(Path(data_dir).glob('*.csv'))

    if not csv_files:
        print("❌ No CSV files found! Creating synthetic data for testing...")
        return create_synthetic_data(sample_size or 10000)

    print(f"📖 Loading data from {len(csv_files)} files...")

    dataframes = []
    for file in csv_files:
        try:
            print(f"   Loading {file.name}...")
            try:
                df = pd.read_csv(file, encoding='utf-8', low_memory=False)
            except UnicodeDecodeError:
                # A file that is not valid UTF-8 (e.g. Windows-1252
                # punctuation inside label text) used to be skipped
                # entirely; retry with cp1252 instead of losing its rows.
                print(f"   ⚠️  {file.name} is not valid UTF-8; retrying as cp1252")
                df = pd.read_csv(file, encoding='cp1252', low_memory=False)

            # Clean column names (literal replacement, not a regex)
            df.columns = df.columns.str.strip().str.replace(' ', '_', regex=False)

            dataframes.append(df)
            print(f"   ✅ Loaded {len(df):,} rows")

        except Exception as e:
            # Best effort by design: report the unreadable file and carry on
            print(f"   ❌ Error loading {file.name}: {e}")

    if not dataframes:
        print("❌ No data loaded! Creating synthetic data...")
        return create_synthetic_data(sample_size or 10000)

    # Combine dataframes
    df = pd.concat(dataframes, ignore_index=True)
    print(f"🔄 Combined dataset: {len(df):,} rows, {len(df.columns)} columns")

    # Sample if requested (fixed seed keeps the subset repeatable)
    if sample_size and len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)
        print(f"🎲 Sampled {sample_size:,} rows")

    return df

def create_synthetic_data(n_samples=10000):
    """
    Create synthetic network traffic data for testing.

    Draws 77 flow-style feature columns from fixed random distributions and
    adds a ``Label`` column holding 'BENIGN' or one of 14 attack names.
    A few features of the attack rows are then rescaled so the classes are
    not statistically identical.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        DataFrame with ``n_samples`` rows and 78 columns (features + Label).

    Note:
        Re-seeds NumPy's global random state with 42 on every call, so the
        output is repeatable but the caller's global RNG state is reset.
    """
    print(f"🔧 Creating synthetic dataset with {n_samples:,} samples...")

    np.random.seed(42)

    # Create synthetic features similar to network traffic
    data = {
        'Flow_Duration': np.random.exponential(1000, n_samples),
        'Total_Fwd_Packets': np.random.poisson(10, n_samples),
        'Total_Backward_Packets': np.random.poisson(8, n_samples),
        'Total_Length_of_Fwd_Packets': np.random.exponential(500, n_samples),
        'Total_Length_of_Bwd_Packets': np.random.exponential(400, n_samples),
        'Fwd_Packet_Length_Max': np.random.gamma(2, 100, n_samples),
        'Fwd_Packet_Length_Min': np.random.gamma(1, 50, n_samples),
        'Fwd_Packet_Length_Mean': np.random.normal(200, 50, n_samples),
        'Fwd_Packet_Length_Std': np.random.gamma(1, 30, n_samples),
        'Bwd_Packet_Length_Max': np.random.gamma(2, 80, n_samples),
        'Bwd_Packet_Length_Min': np.random.gamma(1, 40, n_samples),
        'Bwd_Packet_Length_Mean': np.random.normal(150, 40, n_samples),
        'Bwd_Packet_Length_Std': np.random.gamma(1, 25, n_samples),
        'Flow_Bytes/s': np.random.exponential(1000, n_samples),
        'Flow_Packets/s': np.random.exponential(10, n_samples),
        'Flow_IAT_Mean': np.random.exponential(100, n_samples),
        'Flow_IAT_Std': np.random.exponential(50, n_samples),
        'Flow_IAT_Max': np.random.exponential(200, n_samples),
        'Flow_IAT_Min': np.random.exponential(10, n_samples),
        'Fwd_IAT_Total': np.random.exponential(500, n_samples),
        'Fwd_IAT_Mean': np.random.exponential(80, n_samples),
        'Fwd_IAT_Std': np.random.exponential(40, n_samples),
        'Fwd_IAT_Max': np.random.exponential(150, n_samples),
        'Fwd_IAT_Min': np.random.exponential(8, n_samples),
        'Bwd_IAT_Total': np.random.exponential(400, n_samples),
        'Bwd_IAT_Mean': np.random.exponential(70, n_samples),
        'Bwd_IAT_Std': np.random.exponential(35, n_samples),
        'Bwd_IAT_Max': np.random.exponential(120, n_samples),
        'Bwd_IAT_Min': np.random.exponential(6, n_samples),
        'Fwd_PSH_Flags': np.random.binomial(5, 0.1, n_samples),
        'Bwd_PSH_Flags': np.random.binomial(3, 0.1, n_samples),
        'Fwd_URG_Flags': np.random.binomial(1, 0.01, n_samples),
        'Bwd_URG_Flags': np.random.binomial(1, 0.01, n_samples),
        'Fwd_Header_Length': np.random.normal(20, 5, n_samples),
        'Bwd_Header_Length': np.random.normal(20, 5, n_samples),
        'Fwd_Packets/s': np.random.exponential(5, n_samples),
        'Bwd_Packets/s': np.random.exponential(4, n_samples),
        'Min_Packet_Length': np.random.gamma(1, 20, n_samples),
        'Max_Packet_Length': np.random.gamma(3, 200, n_samples),
        'Packet_Length_Mean': np.random.normal(180, 60, n_samples),
        'Packet_Length_Std': np.random.gamma(2, 40, n_samples),
        'Packet_Length_Variance': np.random.gamma(3, 500, n_samples),
        'FIN_Flag_Count': np.random.binomial(2, 0.3, n_samples),
        'SYN_Flag_Count': np.random.binomial(2, 0.2, n_samples),
        'RST_Flag_Count': np.random.binomial(1, 0.1, n_samples),
        'PSH_Flag_Count': np.random.binomial(3, 0.15, n_samples),
        'ACK_Flag_Count': np.random.binomial(10, 0.8, n_samples),
        'URG_Flag_Count': np.random.binomial(1, 0.01, n_samples),
        'CWE_Flag_Count': np.random.binomial(1, 0.005, n_samples),
        'ECE_Flag_Count': np.random.binomial(1, 0.005, n_samples),
        'Down/Up_Ratio': np.random.gamma(1, 1, n_samples),
        'Average_Packet_Size': np.random.normal(200, 80, n_samples),
        'Avg_Fwd_Segment_Size': np.random.normal(180, 70, n_samples),
        'Avg_Bwd_Segment_Size': np.random.normal(160, 60, n_samples),
        'Fwd_Header_Length.1': np.random.normal(20, 5, n_samples),
        'Fwd_Avg_Bytes/Bulk': np.random.exponential(100, n_samples),
        'Fwd_Avg_Packets/Bulk': np.random.exponential(5, n_samples),
        'Fwd_Avg_Bulk_Rate': np.random.exponential(50, n_samples),
        'Bwd_Avg_Bytes/Bulk': np.random.exponential(80, n_samples),
        'Bwd_Avg_Packets/Bulk': np.random.exponential(4, n_samples),
        'Bwd_Avg_Bulk_Rate': np.random.exponential(40, n_samples),
        'Subflow_Fwd_Packets': np.random.poisson(8, n_samples),
        'Subflow_Fwd_Bytes': np.random.exponential(400, n_samples),
        'Subflow_Bwd_Packets': np.random.poisson(6, n_samples),
        'Subflow_Bwd_Bytes': np.random.exponential(300, n_samples),
        'Init_Win_bytes_forward': np.random.normal(8192, 2000, n_samples),
        'Init_Win_bytes_backward': np.random.normal(8192, 2000, n_samples),
        'act_data_pkt_fwd': np.random.poisson(5, n_samples),
        'min_seg_size_forward': np.random.gamma(1, 20, n_samples),
        'Active_Mean': np.random.exponential(1000, n_samples),
        'Active_Std': np.random.exponential(500, n_samples),
        'Active_Max': np.random.exponential(2000, n_samples),
        'Active_Min': np.random.exponential(100, n_samples),
        'Idle_Mean': np.random.exponential(5000, n_samples),
        'Idle_Std': np.random.exponential(2000, n_samples),
        'Idle_Max': np.random.exponential(10000, n_samples),
        'Idle_Min': np.random.exponential(500, n_samples)
    }

    # Create labels with an imbalanced distribution:
    # 80% normal traffic, 20% attacks
    attack_types = ['BENIGN', 'DoS_Hulk', 'PortScan', 'DDoS', 'DoS_GoldenEye', 'FTP-Patator', 'SSH-Patator', 'DoS_slowloris', 'DoS_Slowhttptest', 'Bot', 'Web_Attack_Brute_Force', 'Web_Attack_XSS', 'Infiltration', 'Web_Attack_Sql_Injection', 'Heartbleed']

    # Weighted probabilities (BENIGN is most common)
    probabilities = [0.8] + [0.2/14] * 14  # 80% benign, 20% attacks distributed
    labels = np.random.choice(attack_types, n_samples, p=probabilities)

    data['Label'] = labels

    df = pd.DataFrame(data)

    # Make attack traffic slightly different from benign traffic
    attack_mask = df['Label'] != 'BENIGN'
    n_attacks = int(attack_mask.sum())

    # The Poisson draw is an integer column; cast it to float first so the
    # float scaling below is not an incompatible-dtype assignment (pandas
    # warns about that from 2.1 and plans to make it an error).
    df['Total_Fwd_Packets'] = df['Total_Fwd_Packets'].astype(float)

    df.loc[attack_mask, 'Flow_Duration'] *= np.random.uniform(0.5, 2.0, n_attacks)
    df.loc[attack_mask, 'Total_Fwd_Packets'] *= np.random.uniform(1.2, 3.0, n_attacks)
    df.loc[attack_mask, 'Flow_Bytes/s'] *= np.random.uniform(0.3, 1.5, n_attacks)

    print(f"✅ Created synthetic dataset: {len(df):,} rows, {len(df.columns)} columns")
    print(f"📊 Label distribution:")
    label_counts = df['Label'].value_counts()
    for label, count in label_counts.items():
        percentage = (count / len(df)) * 100
        print(f"   {label:<25}: {count:>6,} ({percentage:>5.1f}%)")

    return df

# Load the dataset, capped at 50,000 rows so later cells run quickly
print("📊 Loading CICIDS2017 dataset...")
df = load_and_preprocess_data(DATA_DIR, sample_size=50_000)

## 2. Data Preprocessing Pipeline

In [None]:
# Comprehensive preprocessing pipeline
class NetworkTrafficPreprocessor:
    """
    Preprocessing pipeline for tabular flow features plus a label column.

    ``fit_transform`` learns, in order: which columns to keep (columns with
    at most one distinct value are dropped), median imputation, label
    encoding, optional ANOVA-F feature selection and robust scaling.
    ``transform`` replays exactly those fitted steps on new data.
    """

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = RobustScaler()  # Robust to outliers
        self.imputer = SimpleImputer(strategy='median')
        self.feature_selector = None
        self.feature_names = None
        self.label_mapping = None
        # Columns handed to the imputer during fit (after constant-column
        # removal). transform() must present the same columns, same order.
        self.input_columns = None

    def fit_transform(self, df, target_col='Label', n_features=None):
        """
        Fit preprocessor and transform data.

        Args:
            df: DataFrame holding the feature columns and ``target_col``.
            target_col: Name of the label column.
            n_features: If set and smaller than the number of remaining
                columns, keep only the ``n_features`` best columns
                according to ``f_classif``.

        Returns:
            Tuple ``(X, y_encoded)``: the scaled feature DataFrame (same
            index as ``df``) and the integer-encoded labels.
        """
        print("🔧 Starting preprocessing pipeline...")

        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        print(f"   Original shape: {X.shape}")

        # 1. Handle infinite values
        print("   🧹 Handling infinite values...")
        X = X.replace([np.inf, -np.inf], np.nan)

        # 2. Remove constant features (this also removes all-NaN columns,
        #    whose nunique() is 0)
        print("   🧹 Removing constant features...")
        constant_features = [col for col in X.columns if X[col].nunique() <= 1]
        if constant_features:
            print(f"      Removing {len(constant_features)} constant features")
            X = X.drop(columns=constant_features)

        # Remember the surviving columns so transform() can reproduce them
        self.input_columns = X.columns.tolist()

        # 3. Handle missing values
        print("   🧹 Imputing missing values...")
        X_imputed = self.imputer.fit_transform(X)
        X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

        # 4. Encode labels (once; the selector below reuses the encoding)
        print("   🏷️  Encoding labels...")
        y_encoded = self.label_encoder.fit_transform(y)

        # 5. Feature selection (optional). Reset first so a refit without
        #    selection does not keep a selector from an earlier fit.
        self.feature_selector = None
        if n_features and n_features < X.shape[1]:
            print(f"   🎯 Selecting top {n_features} features...")
            self.feature_selector = SelectKBest(score_func=f_classif, k=n_features)
            X_selected = self.feature_selector.fit_transform(X, y_encoded)

            # Get selected feature names
            selected_features = X.columns[self.feature_selector.get_support()]
            X = pd.DataFrame(X_selected, columns=selected_features, index=X.index)
            print(f"      Selected features: {list(selected_features[:10])}...")

        # 6. Scale features
        print("   ⚖️  Scaling features...")
        X_scaled = self.scaler.fit_transform(X)
        X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

        # Store feature names and label mapping
        self.feature_names = X.columns.tolist()
        self.label_mapping = dict(zip(self.label_encoder.classes_,
                                    self.label_encoder.transform(self.label_encoder.classes_)))

        print(f"   ✅ Final shape: {X.shape}")
        print(f"   📊 Label mapping: {self.label_mapping}")

        return X, y_encoded

    def transform(self, df, target_col='Label'):
        """
        Transform new data using fitted preprocessor.

        Args:
            df: DataFrame with ``target_col`` and at least the feature
                columns that were kept during ``fit_transform``.
            target_col: Name of the label column.

        Returns:
            Tuple ``(X, y_encoded)`` with the same columns as the output of
            ``fit_transform``.

        Raises:
            ValueError: If the preprocessor has not been fitted, or ``df``
                lacks a column that was kept during fitting.
        """
        if self.input_columns is None:
            raise ValueError("Preprocessor is not fitted; call fit_transform first.")

        X = df.drop(columns=[target_col])
        y = df[target_col]

        missing = [col for col in self.input_columns if col not in X.columns]
        if missing:
            raise ValueError(f"Input is missing columns seen during fit: {missing}")

        # Keep exactly the columns the imputer was fitted on (this drops
        # the constant columns removed during fit and fixes column order)
        X = X[self.input_columns]

        # Apply same transformations
        X = X.replace([np.inf, -np.inf], np.nan)
        X_imputed = self.imputer.transform(X)
        X = pd.DataFrame(X_imputed, columns=self.input_columns, index=X.index)

        if self.feature_selector is not None:
            X_selected = self.feature_selector.transform(X)
            X = pd.DataFrame(X_selected, columns=self.feature_names, index=X.index)

        X_scaled = self.scaler.transform(X)
        X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

        y_encoded = self.label_encoder.transform(y)

        return X, y_encoded

# Build the pipeline and fit it on the loaded frame, keeping the 50
# best-scoring features.
# NOTE(review): the imputer, feature selector and scaler are fitted on all
# rows before the train/validation/test split, so held-out rows influence
# preprocessing -- consider fitting on the training split only.
preprocessor = NetworkTrafficPreprocessor()
X_processed, y_processed = preprocessor.fit_transform(
    df,
    target_col='Label',
    n_features=50,
)

print(f"\n📊 Preprocessing Summary:")
print(f"   Features: {X_processed.shape[1]}")
print(f"   Samples: {X_processed.shape[0]}")
print(f"   Classes: {len(np.unique(y_processed))}")
print(f"   Feature names: {preprocessor.feature_names[:10]}...")

## 3. Train-Test Split and Class Balance Analysis

In [None]:
# Split data with stratification
print("🔄 Splitting data into train/validation/test sets...")

# Stratified splitting raises ValueError when a class has too few rows to
# be shared between the splits. After subsampling an imbalanced dataset a
# rare class can be left with only a handful of rows, so such classes are
# dropped (with a notice) instead of aborting the notebook.
MIN_SAMPLES_PER_CLASS = 5
class_ids, class_counts = np.unique(y_processed, return_counts=True)
rare_class_ids = class_ids[class_counts < MIN_SAMPLES_PER_CLASS]
keep_mask = ~np.isin(y_processed, rare_class_ids)
if rare_class_ids.size:
    rare_names = preprocessor.label_encoder.inverse_transform(rare_class_ids)
    print(f"⚠️  Dropping {int((~keep_mask).sum()):,} samples from classes with "
          f"fewer than {MIN_SAMPLES_PER_CLASS} rows: {list(rare_names)}")

# When nothing is dropped these are the full processed arrays
X_split = X_processed.loc[keep_mask]
y_split = y_processed[keep_mask]

# First split: train+val vs test (80-20)
X_temp, X_test, y_temp, y_test = train_test_split(
    X_split, y_split,
    test_size=0.2,
    random_state=42,
    stratify=y_split
)

# Second split: train vs val (75-25 of the remaining 80%, i.e. 60/20 overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp
)

print(f"📊 Data split summary:")
print(f"   Training set:   {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X_split)*100:.1f}%)")
print(f"   Validation set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(X_split)*100:.1f}%)")
print(f"   Test set:       {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X_split)*100:.1f}%)")

# Analyze class distribution
def analyze_class_distribution(y, set_name, label_encoder):
    """
    Print and return the per-class sample counts of an encoded label array.

    Args:
        y: Array of encoded class labels.
        set_name: Human-readable name of the split, used in the heading.
        label_encoder: Object whose ``inverse_transform`` maps encoded
            labels back to class names.

    Returns:
        Dict mapping class name to sample count. Keys and values are native
        Python objects (not numpy scalars), so the dict can be written with
        ``json`` without the counts being turned into strings.
    """
    unique, counts = np.unique(y, return_counts=True)
    class_names = label_encoder.inverse_transform(unique)
    total = len(y)

    print(f"\n📊 {set_name} class distribution:")
    distribution = {}
    for class_name, count in zip(class_names, counts):
        count = int(count)
        percentage = (count / total) * 100
        print(f"   {class_name:<25}: {count:>6,} ({percentage:>5.1f}%)")
        # .item() unwraps numpy scalars (np.str_ -> str, np.int64 -> int)
        key = class_name.item() if isinstance(class_name, np.generic) else class_name
        distribution[key] = count

    return distribution

# Per-split class distributions (printed, and kept for the results summary)
label_enc = preprocessor.label_encoder
train_dist = analyze_class_distribution(y_train, "Training", label_enc)
val_dist = analyze_class_distribution(y_val, "Validation", label_enc)
test_dist = analyze_class_distribution(y_test, "Test", label_enc)

# 'balanced' class weights derived from the training labels only
from sklearn.utils.class_weight import compute_class_weight

train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    'balanced',
    classes=train_classes,
    y=y_train
)
class_weight_dict = dict(zip(train_classes, class_weights))

print(f"\n⚖️  Class weights for balancing:")
# Decode all class ids in one call instead of one call per class
for class_name, weight in zip(label_enc.inverse_transform(train_classes), class_weights):
    print(f"   {class_name:<25}: {weight:.3f}")

## 4. Random Forest Model Training

In [None]:
# Random Forest hyper-parameters for the baseline (hand-picked, not tuned)
print("🌲 Configuring Random Forest classifier...")

rf_params = dict(
    n_estimators=100,                 # trees in the forest
    max_depth=20,                     # depth cap per tree
    min_samples_split=5,              # samples needed to split a node
    min_samples_leaf=2,               # samples required in each leaf
    max_features='sqrt',              # features considered at each split
    bootstrap=True,                   # each tree sees a bootstrap sample
    class_weight=class_weight_dict,   # counter class imbalance
    random_state=42,                  # reproducibility
    n_jobs=-1,                        # use all CPU cores
    verbose=1,                        # show progress
)

print(f"📋 Random Forest parameters:")
for param, value in rf_params.items():
    if param == 'class_weight':
        # The weight dict is long; show how it was derived instead
        print(f"   {param:<20}: balanced")
        continue
    print(f"   {param:<20}: {value}")

# Build the estimator from the parameter dict
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split and time it
print(f"\n🚀 Training Random Forest model...")
fit_started = time.time()

rf_model.fit(X_train, y_train)

training_time = time.time() - fit_started
print(f"✅ Training completed in {training_time:.2f} seconds")

# Summary of the fitted estimator
print(f"\n📊 Model Information:")
print(f"   Number of trees: {rf_model.n_estimators}")
print(f"   Number of features: {rf_model.n_features_in_}")
print(f"   Number of classes: {rf_model.n_classes_}")
print(f"   Feature importance available: {hasattr(rf_model, 'feature_importances_')}")

## 5. Model Evaluation

In [None]:
# Comprehensive model evaluation
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test, label_encoder):
    """
    Score a fitted classifier on the training, validation and test splits.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_train, y_train: Training features and encoded labels.
        X_val, y_val: Validation features and encoded labels.
        X_test, y_test: Test features and encoded labels.
        label_encoder: Currently unused; kept so existing callers keep
            working.

    Returns:
        Dict keyed by 'Training', 'Validation' and 'Test'. Each value holds
        'accuracy', weighted 'precision' / 'recall' / 'f1_score', 'roc_auc',
        'prediction_time' (seconds for predict + predict_proba), and the
        raw 'y_true', 'y_pred' and 'y_pred_proba' arrays. 'roc_auc' is NaN
        when the score cannot be computed for a split.
    """
    print("📊 Evaluating model performance...")

    results = {}

    # Predictions for all sets
    sets = {
        'Training': (X_train, y_train),
        'Validation': (X_val, y_val),
        'Test': (X_test, y_test)
    }

    for set_name, (X, y) in sets.items():
        print(f"\n🔍 {set_name} Set Evaluation:")

        # Predictions (perf_counter: monotonic, high-resolution timer)
        start_time = time.perf_counter()
        y_pred = model.predict(X)
        y_pred_proba = model.predict_proba(X)
        prediction_time = time.perf_counter() - start_time

        # Basic metrics
        accuracy = accuracy_score(y, y_pred)
        precision = precision_score(y, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

        # ROC AUC. For two classes roc_auc_score expects the 1-D score of
        # the positive class, so the 2-column probability matrix cannot be
        # passed as is.
        try:
            if y_pred_proba.shape[1] == 2:
                roc_auc = roc_auc_score(y, y_pred_proba[:, 1])
            else:
                roc_auc = roc_auc_score(y, y_pred_proba, multi_class='ovr', average='weighted')
        except ValueError as exc:
            # Undefined for this split (e.g. a class is absent from y).
            # Report NaN: 0.0 would read as a real, very bad score.
            print(f"   ⚠️  ROC AUC could not be computed: {exc}")
            roc_auc = float('nan')

        # Guard against a zero elapsed time on very small inputs
        samples_per_second = len(X) / prediction_time if prediction_time > 0 else float('inf')

        print(f"   Accuracy:  {accuracy:.4f}")
        print(f"   Precision: {precision:.4f}")
        print(f"   Recall:    {recall:.4f}")
        print(f"   F1-Score:  {f1:.4f}")
        print(f"   ROC AUC:   {roc_auc:.4f}")
        print(f"   Prediction time: {prediction_time:.4f}s ({samples_per_second:.0f} samples/s)")

        # Store results
        results[set_name] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc,
            'prediction_time': prediction_time,
            'y_true': y,
            'y_pred': y_pred,
            'y_pred_proba': y_pred_proba
        }

    return results

# Score the trained forest on all three splits
evaluation_results = evaluate_model(
    rf_model,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    preprocessor.label_encoder,
)

In [None]:
# Detailed classification report for test set
print("📋 Detailed Classification Report (Test Set):")
print("=" * 80)

y_test_pred = evaluation_results['Test']['y_pred']
class_names = preprocessor.label_encoder.classes_

# Encoded id of every class known to the encoder. Passing these explicitly
# keeps the report rows and the matrix axes aligned with class_names even
# when a class does not occur in the test labels or predictions.
class_ids = np.arange(len(class_names))

report = classification_report(
    y_test, y_test_pred,
    labels=class_ids,
    target_names=class_names,
    digits=4,
    zero_division=0
)
print(report)

# Confusion Matrix
print("\n🔍 Confusion Matrix (Test Set):")
cm = confusion_matrix(y_test, y_test_pred, labels=class_ids)

# Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - Random Forest (Test Set)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Per-class accuracy (diagonal / row total, i.e. per-class recall)
print("\n🎯 Per-Class Accuracy:")
class_support = cm.sum(axis=1)
# A class with zero test samples yields 0/0; let it print as nan
with np.errstate(divide='ignore', invalid='ignore'):
    class_accuracy = cm.diagonal() / class_support
for class_name, acc, support in zip(class_names, class_accuracy, class_support):
    print(f"   {class_name:<25}: {acc:.4f} (support: {support:,})")

## 6. Feature Importance Analysis

In [None]:
# Feature importance analysis
print("🔍 Analyzing feature importance...")

# Importances from the fitted forest, paired with the feature names the
# preprocessor stored
feature_importance = rf_model.feature_importances_
feature_names = preprocessor.feature_names

# Create feature importance DataFrame, most important first
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print(f"\n📊 Top 20 Most Important Features:")
print("=" * 60)
for i, (_, row) in enumerate(importance_df.head(20).iterrows(), 1):
    print(f"{i:2d}. {row['feature']:<35}: {row['importance']:.6f}")

# Plot feature importance
# NOTE(review): newer seaborn releases warn about `palette` without `hue`;
# revisit this call if the warning turns into an error.
plt.figure(figsize=(12, 8))
top_features = importance_df.head(20)
sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
plt.title('Top 20 Feature Importances - Random Forest')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

# Cumulative importance, as a plain positional array: the sorted frame
# keeps its original (shuffled) index labels, so positions are what count.
cumulative_importance = importance_df['importance'].to_numpy().cumsum()
# argmax of a boolean array is the position of the first True; +1 turns
# it into a feature count. int() gives native ints so the JSON summary
# stores numbers rather than strings.
n_features_90 = int(np.argmax(cumulative_importance >= 0.9) + 1)
n_features_95 = int(np.argmax(cumulative_importance >= 0.95) + 1)

print(f"\n📈 Cumulative Feature Importance:")
print(f"   Features for 90% importance: {n_features_90}")
print(f"   Features for 95% importance: {n_features_95}")
print(f"   Total features used: {len(feature_names)}")

# Plot cumulative importance
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'b-', linewidth=2)
plt.axhline(y=0.9, color='r', linestyle='--', alpha=0.7, label='90% threshold')
plt.axhline(y=0.95, color='orange', linestyle='--', alpha=0.7, label='95% threshold')
plt.axvline(x=n_features_90, color='r', linestyle=':', alpha=0.7)
plt.axvline(x=n_features_95, color='orange', linestyle=':', alpha=0.7)
plt.xlabel('Number of Features')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Feature Importance')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Cross-Validation and Model Robustness

In [None]:
# Cross-validation for model robustness
print("🔄 Performing cross-validation...")

# Local import: cross_validate scores several metrics from ONE set of fold
# fits, instead of refitting the forest once per metric.
from sklearn.model_selection import cross_validate

# Use stratified k-fold for imbalanced data
cv_folds = 5
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

# Combine train and validation sets for CV (the test set stays untouched)
X_cv = pd.concat([X_train, X_val], ignore_index=True)
y_cv = np.concatenate([y_train, y_val])

print(f"   Using {cv_folds}-fold stratified cross-validation")
print(f"   CV dataset size: {len(X_cv):,} samples")

# Perform cross-validation: every metric is computed on the same folds
cv_metrics = ['accuracy', 'precision_weighted', 'recall_weighted']
metrics_to_plot = ['f1_weighted'] + cv_metrics

cv_start_time = time.time()

cv_output = cross_validate(
    rf_model, X_cv, y_cv,
    cv=skf,
    scoring=metrics_to_plot,
    n_jobs=-1,
    verbose=1
)

cv_time = time.time() - cv_start_time

# With a list of scorers the result keys are 'test_<scorer name>'
cv_scores = cv_output['test_f1_weighted']

print(f"\n📊 Cross-Validation Results:")
print(f"   Mean F1-Score: {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")
print(f"   Individual scores: {[f'{score:.4f}' for score in cv_scores]}")
print(f"   CV time: {cv_time:.2f} seconds")

# Additional CV metrics
cv_results = {}

for metric in cv_metrics:
    scores = cv_output[f'test_{metric}']
    cv_results[metric] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }
    print(f"   {metric:<20}: {scores.mean():.4f} (±{scores.std()*2:.4f})")

# Plot CV results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

all_scores = [cv_scores] + [cv_results[m]['scores'] for m in cv_metrics]

for i, (metric, scores) in enumerate(zip(metrics_to_plot, all_scores)):
    # Tick label set separately: the boxplot `labels=` keyword is
    # deprecated in recent matplotlib releases.
    axes[i].boxplot(scores)
    axes[i].set_xticklabels([metric.replace('_', ' ').title()])
    axes[i].set_title(f'{metric.replace("_", " ").title()} Distribution')
    axes[i].set_ylabel('Score')
    axes[i].grid(True, alpha=0.3)

    # Add mean line
    axes[i].axhline(y=scores.mean(), color='red', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

## 8. Model Persistence and Results Summary

In [None]:
# Save model and preprocessor
print("💾 Saving model and preprocessor...")


def _json_default(value):
    """Make numpy values JSON-friendly; anything else falls back to str()."""
    if isinstance(value, np.generic):
        # numpy scalar -> native int/float/bool/str, so numbers stay numbers
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# Create timestamp for versioning
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save model
model_path = MODELS_DIR / f'random_forest_baseline_{timestamp}.joblib'
joblib.dump(rf_model, model_path)
print(f"   ✅ Model saved to: {model_path}")

# Save preprocessor
preprocessor_path = MODELS_DIR / f'preprocessor_{timestamp}.joblib'
joblib.dump(preprocessor, preprocessor_path)
print(f"   ✅ Preprocessor saved to: {preprocessor_path}")

# Save feature importance
importance_path = RESULTS_DIR / f'feature_importance_{timestamp}.csv'
importance_df.to_csv(importance_path, index=False)
print(f"   ✅ Feature importance saved to: {importance_path}")

# Comprehensive results summary
results_summary = {
    'model_info': {
        'model_type': 'RandomForestClassifier',
        'timestamp': timestamp,
        'training_time': training_time,
        'n_estimators': rf_model.n_estimators,
        'max_depth': rf_model.max_depth,
        'n_features': rf_model.n_features_in_,
        'n_classes': rf_model.n_classes_
    },
    'data_info': {
        'total_samples': len(df),
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'n_features_selected': len(preprocessor.feature_names),
        'class_distribution': train_dist
    },
    'performance': {
        'test_accuracy': evaluation_results['Test']['accuracy'],
        'test_precision': evaluation_results['Test']['precision'],
        'test_recall': evaluation_results['Test']['recall'],
        'test_f1_score': evaluation_results['Test']['f1_score'],
        'test_roc_auc': evaluation_results['Test']['roc_auc'],
        'cv_f1_mean': cv_scores.mean(),
        'cv_f1_std': cv_scores.std()
    },
    'feature_analysis': {
        'top_10_features': importance_df.head(10)['feature'].tolist(),
        'features_for_90_percent': n_features_90,
        'features_for_95_percent': n_features_95
    },
    'files': {
        'model_path': str(model_path),
        'preprocessor_path': str(preprocessor_path),
        'importance_path': str(importance_path)
    }
}

# Save results summary. The custom default keeps numpy counts and scores
# as JSON numbers; a plain default=str would write them as quoted strings.
summary_path = RESULTS_DIR / f'baseline_results_summary_{timestamp}.json'
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2, default=_json_default)
print(f"   ✅ Results summary saved to: {summary_path}")

# Display final summary
print(f"\n🎉 Random Forest Baseline Training Complete!")
print(f"=" * 60)
print(f"📊 Final Performance Summary:")
print(f"   Test Accuracy:  {evaluation_results['Test']['accuracy']:.4f}")
print(f"   Test Precision: {evaluation_results['Test']['precision']:.4f}")
print(f"   Test Recall:    {evaluation_results['Test']['recall']:.4f}")
print(f"   Test F1-Score:  {evaluation_results['Test']['f1_score']:.4f}")
print(f"   Test ROC AUC:   {evaluation_results['Test']['roc_auc']:.4f}")
print(f"   CV F1-Score:    {cv_scores.mean():.4f} (±{cv_scores.std()*2:.4f})")

# Guard the throughput figure against a zero measured prediction time
test_prediction_time = evaluation_results['Test']['prediction_time']
prediction_speed = len(X_test) / test_prediction_time if test_prediction_time > 0 else float('inf')

print(f"\n⏱️  Training Time: {training_time:.2f} seconds")
print(f"🔍 Cross-Validation Time: {cv_time:.2f} seconds")
print(f"🚀 Prediction Speed: {prediction_speed:.0f} samples/second")

print(f"\n📁 Saved Files:")
print(f"   Model: {model_path.name}")
print(f"   Preprocessor: {preprocessor_path.name}")
print(f"   Feature Importance: {importance_path.name}")
print(f"   Results Summary: {summary_path.name}")

print(f"\n🎯 Next Steps:")
print(f"   1. Compare with BERT-IDS model performance")
print(f"   2. Analyze misclassified samples")
print(f"   3. Experiment with feature engineering")
print(f"   4. Try ensemble methods")
print(f"   5. Implement real-time inference pipeline")