In [1]:
%pip install -q catboost lightgbm pytorch-tabnet kagglehub scikit-learn pandas Flask wheel


Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -q numpy==1.26.4 pandas==2.2.2 --force-reinstall

Note: you may need to restart the kernel to use updated packages.


In [3]:
import kagglehub
import pandas as pd
import numpy as np
import os
import pickle
import torch
import lightgbm as lgb

from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from lightgbm import early_stopping, log_evaluation


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
path = kagglehub.dataset_download("hassan06/nslkdd")
print("Path to dataset files:", path)

train_file_path = os.path.join(path, 'KDDTrain+.txt')
test_file_path = os.path.join(path, 'KDDTest+.txt')

column_names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label', 'difficulty']

wireshark_features = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
                      'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'label']

df_train = pd.read_csv(train_file_path, names=column_names, index_col=False)
df_test  = pd.read_csv(test_file_path, names=column_names, index_col=False)
df_train = df_train[wireshark_features]
df_test  = df_test[wireshark_features]

def convert_label(x):
    return 0 if str(x).strip().lower() == 'normal' else 1

df_train['label'] = df_train['label'].apply(convert_label)
df_test['label']  = df_test['label'].apply(convert_label)

categorical_features = ['protocol_type', 'service', 'flag']
numerical_features = [col for col in wireshark_features if col not in categorical_features + ['label']]

df_combined = pd.concat([df_train, df_test], ignore_index=True)
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numerical_features)
])

X_combined = df_combined.drop('label', axis=1)
y_combined = df_combined['label']

X_train = X_combined.iloc[:len(df_train), :]
X_test  = X_combined.iloc[len(df_train):, :]
y_train = y_combined.iloc[:len(df_train)]
y_test  = y_combined.iloc[len(df_train):]

X_train = preprocessor.fit_transform(X_train)
X_test  = preprocessor.transform(X_test)

with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)


Path to dataset files: C:\Users\Zeeshan\.cache\kagglehub\datasets\hassan06\nslkdd\versions\1


In [5]:
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=200)
catboost_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)

y_pred_cat = catboost_model.predict(X_test)
cat_accuracy  = accuracy_score(y_test, y_pred_cat)
cat_precision = precision_score(y_test, y_pred_cat)
cat_recall    = recall_score(y_test, y_pred_cat)
cat_f1        = f1_score(y_test, y_pred_cat)

print("CatBoost Metrics:")
print(f"  Accuracy: {cat_accuracy:.2f}")
print(f"  Precision: {cat_precision:.2f}")
print(f"  Recall: {cat_recall:.2f}")
print(f"  F1 Score: {cat_f1:.2f}")


0:	learn: 0.4652940	test: 0.5415290	best: 0.5415290 (0)	total: 207ms	remaining: 3m 26s
Stopped by overfitting detector  (10 iterations wait)

bestTest = 0.3880590668
bestIteration = 5

Shrink model to first 6 iterations.
CatBoost Metrics:
  Accuracy: 0.84
  Precision: 0.96
  Recall: 0.75
  F1 Score: 0.84


In [6]:
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

lgb_model = lgb.train(
    {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.1,
        'feature_fraction': 0.9
    },
    train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[early_stopping(stopping_rounds=10), log_evaluation(100)]
)

y_pred_lgb = (lgb_model.predict(X_test) >= 0.5).astype(int)
lgb_accuracy  = accuracy_score(y_test, y_pred_lgb)
lgb_precision = precision_score(y_test, y_pred_lgb)
lgb_recall    = recall_score(y_test, y_pred_lgb)
lgb_f1        = f1_score(y_test, y_pred_lgb)

print("\nLightGBM Metrics:")
print(f"  Accuracy: {lgb_accuracy:.2f}")
print(f"  Precision: {lgb_precision:.2f}")
print(f"  Recall: {lgb_recall:.2f}")
print(f"  F1 Score: {lgb_f1:.2f}")


[LightGBM] [Info] Number of positive: 58630, number of negative: 67343
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 922
[LightGBM] [Info] Number of data points in the train set: 125973, number of used features: 80
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.465417 -> initscore=-0.138552
[LightGBM] [Info] Start training from score -0.138552
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[21]	valid_0's binary_error: 0.149264

LightGBM Metrics:
  Accuracy: 0.85
  Precision: 0.97
  Recall: 0.76
  F1 Score: 0.85


In [8]:
tabnet_model = TabNetClassifier(
    n_d=8, n_a=8, n_steps=3, gamma=1.3, lambda_sparse=1e-5,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type="entmax"
)

tabnet_model.fit(
    X_train, y_train.values,
    eval_set=[(X_test, y_test.values)],
    max_epochs=15, patience=10,
    batch_size=1024, virtual_batch_size=128
)

y_pred_tabnet = tabnet_model.predict(X_test)
tabnet_accuracy  = accuracy_score(y_test, y_pred_tabnet)
tabnet_precision = precision_score(y_test, y_pred_tabnet)
tabnet_recall    = recall_score(y_test, y_pred_tabnet)
tabnet_f1        = f1_score(y_test, y_pred_tabnet)

print("\nTabNet Metrics:")
print(f"  Accuracy: {tabnet_accuracy:.2f}")
print(f"  Precision: {tabnet_precision:.2f}")
print(f"  Recall: {tabnet_recall:.2f}")
print(f"  F1 Score: {tabnet_f1:.2f}")




epoch 0  | loss: 0.21893 | val_0_auc: 0.7324  |  0:00:22s
epoch 1  | loss: 0.10504 | val_0_auc: 0.73311 |  0:00:40s
epoch 2  | loss: 0.09628 | val_0_auc: 0.93909 |  0:01:01s
epoch 3  | loss: 0.09157 | val_0_auc: 0.93383 |  0:01:19s
epoch 4  | loss: 0.0906  | val_0_auc: 0.76017 |  0:01:40s
epoch 5  | loss: 0.09013 | val_0_auc: 0.91617 |  0:01:59s
epoch 6  | loss: 0.0889  | val_0_auc: 0.94618 |  0:02:17s
epoch 7  | loss: 0.08861 | val_0_auc: 0.91547 |  0:02:36s
epoch 8  | loss: 0.08698 | val_0_auc: 0.94665 |  0:02:55s
epoch 9  | loss: 0.087   | val_0_auc: 0.60365 |  0:03:15s
epoch 10 | loss: 0.08633 | val_0_auc: 0.6049  |  0:03:35s
epoch 11 | loss: 0.08569 | val_0_auc: 0.8648  |  0:03:56s
epoch 12 | loss: 0.08491 | val_0_auc: 0.63445 |  0:04:14s
epoch 13 | loss: 0.08452 | val_0_auc: 0.72411 |  0:04:33s
epoch 14 | loss: 0.08654 | val_0_auc: 0.93389 |  0:04:51s
Stop training because you reached max_epochs = 15 with best_epoch = 8 and best_val_0_auc = 0.94665





TabNet Metrics:
  Accuracy: 0.78
  Precision: 0.90
  Recall: 0.70
  F1 Score: 0.79


In [11]:
model_metrics = {
    "CatBoost": cat_f1,
    "LightGBM": lgb_f1,
    "TabNet": tabnet_f1
}

best_model_name = max(model_metrics, key=model_metrics.get)
print("\n-------------------------")
print("Model Comparison (Based on F1 Score):")
for model_name, f1 in model_metrics.items():
    print(f"  {model_name}: F1 Score = {f1:.2f}")

print(f"\nMost Efficient Model: {best_model_name} (Highest F1 Score)")

if best_model_name == "CatBoost":
    best_model_obj = catboost_model
elif best_model_name == "LightGBM":
    best_model_obj = lgb_model
elif best_model_name == "TabNet":
    best_model_obj = tabnet_model

best_model_filename = os.path.join(os.getcwd(), 'best_model.pkl')
with open(best_model_filename, 'wb') as f:
    pickle.dump(best_model_obj, f)

print(f"Best model saved as {best_model_filename}")


-------------------------
Model Comparison (Based on F1 Score):
  CatBoost: F1 Score = 0.84
  LightGBM: F1 Score = 0.85
  TabNet: F1 Score = 0.79

Most Efficient Model: LightGBM (Highest F1 Score)
Best model saved as c:\Users\Zeeshan\Desktop\Real-Time-Network-Intrusion-Detection-with-WireShark\backend\best_model.pkl
