In [None]:
!pip install git+https://github.com/pycaret/pycaret.git@master --upgrade

In [None]:
import pandas as pd
import numpy as np
from pycaret.classification import *

# --- 1. Load Your Specific Dataset ---
filename = 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'
print(f"--- 1. Loading Dataset: {filename} ---")

# Options passed to pandas when reading the CSV:
#   low_memory=False -> read the file in one pass instead of in internal chunks
#   encoding='latin1' -> decode the file as Latin-1
read_options = {'low_memory': False, 'encoding': 'latin1'}

try:
    data = pd.read_csv(filename, **read_options)
except FileNotFoundError:
    # The CSV is not in the working directory; tell the user, then stop.
    print(f"!!! ERROR: File '{filename}' not found. Did you upload it to Colab?")
    raise
except Exception as load_error:
    # Any other read/parse failure: report it and re-raise so the run halts here.
    print(f"!!! An error occurred during loading: {load_error}")
    raise
else:
    print("--- Dataset Loaded Successfully ---")

# --- 2. Data Preprocessing (CRITICAL) ---
# Turns the raw frame in `data` into a training table: numeric feature columns
# (with NaN where a value was missing, unparsable or infinite) plus a binary
# 'Anomaly' target (1 = any label other than 'BENIGN', 0 = 'BENIGN').
print("--- 2. Preprocessing Data ---")

# a. Clean column names (strip surrounding whitespace from the headers)
data.columns = data.columns.str.strip()

# Fail fast: without a 'Label' column there is nothing to build the target from.
if 'Label' not in data.columns:
    print("!!! ERROR: 'Label' column not found after loading. Check the CSV.")
    raise ValueError("Missing 'Label' column")

# b. Drop rows that carry no label at all. Such rows cannot be used for
#    supervised training, and would otherwise be silently assigned a class
#    they were never labelled with.
unlabeled_count = int(data['Label'].isna().sum())
if unlabeled_count:
    print(f"   - Dropping {unlabeled_count} rows with a missing 'Label'.")
    data = data.dropna(subset=['Label']).reset_index(drop=True)

# c. Convert feature columns to numeric, coercing errors to NaN
print("   - Converting columns to numbers (errors become NaN)...")
numeric_cols = data.columns.drop(['Label', 'Timestamp'], errors='ignore')  # everything except Label/Timestamp
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# d. Replace Inf values with NaN so they are treated like any other missing value
data.replace([np.inf, -np.inf], np.nan, inplace=True)
print("   - Replaced Inf with NaN.")

# e. Create the binary target variable 'Anomaly'
#    Label values are stripped first so stray whitespace around 'BENIGN'
#    does not turn benign traffic into an attack.
labels = data['Label'].astype(str).str.strip()
is_attack = labels != 'BENIGN'
attack_labels = labels[is_attack].unique()  # distinct attack names found in this file
data['Anomaly'] = is_attack.astype(int)
print("   - Created 'Anomaly' target column.")

# f. Drop original columns we don't need for training
#    ('Timestamp' is only dropped when the file actually has it)
columns_to_drop = [col for col in ('Label', 'Timestamp') if col in data.columns]
data.drop(columns=columns_to_drop, inplace=True)


# --- DO NOT DROPNA HERE ---
# Feature NaNs are kept on purpose; just report how many each column has.
nan_counts = data.isna().sum()
print(f"   - NaN values created during conversion:\n{nan_counts[nan_counts > 0]}") # Show columns with NaNs

print("--- Preprocessing Complete ---")
print("Attack (1) vs. Benign (0) counts in this file (before setup imputation):")
print(data['Anomaly'].value_counts())


# --- 3. Setup the PyCaret Environment ---
print("\n--- 3. Setting up PyCaret Experiment ---")
# NaN values left in `data` are handed to setup(), which imputes them
# (PyCaret documents mean imputation as the numeric default).
experiment_options = {
    'data': data,            # preprocessed frame, features + target
    'target': 'Anomaly',     # binary column created during preprocessing
    'session_id': 123,       # fixed seed so runs are repeatable
    'fix_imbalance': True,   # ask PyCaret to rebalance the classes
    'n_jobs': -1,            # -1 = use every available core
}
clf_setup = setup(**experiment_options)


# --- 4. Compare Models for Your Project ---
print("\n--- 4. Comparing Models ---")
# Rank the candidate models by F1 and keep the top one.
best_model = compare_models(sort='F1')


# --- 5. Create the Specific Models You Need ---
print("\n--- 5. Creating Your Specific Models (RF, MLP, SVM) ---")

# (progress message, PyCaret estimator id) for each model the project needs.
requested_models = (
    ("Training Random Forest...", 'rf'),
    ("Training MLP...", 'mlp'),
    ("Training SVM (Linear Kernel)...", 'svm'),
)

trained = {}
for banner, estimator_id in requested_models:
    print(banner)
    trained[estimator_id] = create_model(estimator_id)

# Expose each trained model under its own name for later cells.
rf_model = trained['rf']
mlp_model = trained['mlp']
svm_model = trained['svm']

print("\n--- All models are trained. You can now analyze them. ---")

In [None]:
# --- 1. Get the LightGBM Model Instance ---
# create_model trains a LightGBM classifier within the current PyCaret
# experiment; it does not reuse the object returned by compare_models.
lightgbm_model = create_model('lightgbm')

# --- 2. Plotting Performance ---
# (heading to print, plot_model plot name), in display order:
#   auc              - ROC curve: overall ability to separate Attack (1) from Benign (0)
#   confusion_matrix - CRITICAL for security: exposes False Negatives (missed attacks)
#   feature          - which network features mattered most for predicting attacks
diagnostic_plots = (
    ("\n--- Plotting ROC Curve ---", 'auc'),
    ("\n--- Plotting Confusion Matrix ---", 'confusion_matrix'),
    ("\n--- Plotting Feature Importance ---", 'feature'),
)

for heading, plot_kind in diagnostic_plots:
    print(heading)
    plot_model(lightgbm_model, plot=plot_kind)

In [None]:
# --- Save the Final Model to the Environment Folder ---

# Use your actual Anaconda path as the destination:
local_save_path = 'C:/Users/mthwm/anaconda3/envs/colab_ml_final_311/NIDS_LightGBM_Final_Model'

# Build the model that gets saved. `final_ids_model` was not defined anywhere
# before this point, so the save call raised NameError. finalize_model refits
# the LightGBM pipeline for deployment (per PyCaret docs, on the full dataset
# including the hold-out split).
final_ids_model = finalize_model(lightgbm_model)

# save_model appends the '.pkl' extension to the path it is given.
save_model(final_ids_model, local_save_path)

print(f"\n✅ Final model saved successfully to: {local_save_path}.pkl")