# 🧪 NSL-KDD Preprocessing for Quantum Intrusion Detection
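This notebook preprocesses the NSL-KDD train and test sets for binary (normal vs. attack) classification: labels are binarized, categorical features are one-hot encoded, and features are scaled to [0, 1] so they are compatible with quantum models.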

In [8]:
import pandas as pd
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(".."))
from utils.preprocess import load_nsl_kdd, binarize_labels, encode_categorical, scale_features, save_csv

## 📥 Load Data

In [9]:
# Column names taken from the NSL-KDD documentation:
# 41 feature columns followed by 'label' and 'difficulty'.
column_names = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    # trailing non-feature columns
    'label', 'difficulty',
]

# Read the training split, naming the columns with the list above.
train_path = "../data/KDDTrain+.txt"
train_df = load_nsl_kdd(train_path, col_names=column_names)

## 🔁 Binarize Labels (normal = 0, attack = 1)

In [10]:
# Collapse the 'label' column to two classes (per the heading: normal = 0, attack = 1).
# NOTE(review): this rebinds train_df, so re-running the cell feeds already-binarized
# labels back into binarize_labels — confirm the helper tolerates that.
train_df = train_df.pipe(binarize_labels, label_column='label')

# Show the class balance as the cell output.
label_counts = train_df['label'].value_counts()
label_counts

label
0    67343
1    58630
Name: count, dtype: int64

## 🔠 Encode Categorical Features

In [11]:
# Columns handed to encode_categorical; the same list is reused for the test split.
categorical_cols = [
    'protocol_type',
    'service',
    'flag',
]

# Replace the categorical columns with their encoded form and preview the result.
train_df = train_df.pipe(encode_categorical, categorical_cols)
train_df.head(n=5)

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,491,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
1,0,146,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
2,0,0,0,0,0,0,0,0,0,0,...,False,False,False,False,True,False,False,False,False,False
3,0,232,8153,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
4,0,199,420,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False


## 🔃 Scale Numerical Features to [0, 1]

In [12]:
# Scale the training features; 'label' is passed so the helper can treat it separately.
# NOTE(review): after this line the unscaled training values are no longer available
# under the name train_df.
train_df = train_df.pipe(scale_features, label_column='label')

# Per-column summary statistics, one row per column.
train_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
duration,125973.0,0.006692,0.060700,0.0,0.0,0.000000e+00,0.000000e+00,1.0
src_bytes,125973.0,0.000033,0.004254,0.0,0.0,3.188489e-08,2.000052e-07,1.0
dst_bytes,125973.0,0.000015,0.003070,0.0,0.0,0.000000e+00,3.939120e-07,1.0
land,125973.0,0.000198,0.014086,0.0,0.0,0.000000e+00,0.000000e+00,1.0
wrong_fragment,125973.0,0.007562,0.084510,0.0,0.0,0.000000e+00,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...
flag_S2,125973.0,0.001008,0.031736,0.0,0.0,0.000000e+00,0.000000e+00,1.0
flag_S3,125973.0,0.000389,0.019719,0.0,0.0,0.000000e+00,0.000000e+00,1.0
flag_SF,125973.0,0.594929,0.490908,0.0,0.0,1.000000e+00,1.000000e+00,1.0
flag_SH,125973.0,0.002151,0.046332,0.0,0.0,0.000000e+00,0.000000e+00,1.0


## 💾 Save Preprocessed Dataset

In [14]:
# Write the preprocessed training frame to disk.
train_out_path = '../output/nsl_kdd_preprocessed_train.csv'

# Create the output directory first so the cell also works on a fresh checkout
# (in case save_csv does not create missing directories itself).
os.makedirs(os.path.dirname(train_out_path), exist_ok=True)

save_csv(train_df, train_out_path)
print('✅ Preprocessed file saved.')

✅ Preprocessed file saved.


In [15]:
# 📥 Load the test split with the same column names as the training split
test_path = "../data/KDDTest+.txt"
test_df = load_nsl_kdd(test_path, col_names=column_names)



In [16]:
# 🔁 Binarize the test labels with the same helper used for the training split
test_df = test_df.pipe(binarize_labels, label_column='label')



In [17]:
# 🔠 One-hot encode the same categorical columns as in the training split
test_df = test_df.pipe(encode_categorical, categorical_cols)



In [18]:
# 🔃 Align the test columns with the training columns (IMPORTANT)
# Every non-label training column, in training order.
common_cols = train_df.columns[train_df.columns != "label"].tolist()

# Reorder the test frame to match: columns missing from the test frame are
# added and filled with 0; test columns not present in training are dropped.
test_df = test_df.reindex(columns=[*common_cols, "label"], fill_value=0)



In [19]:
# 🔃 Normalize the test set with the TRAINING min/max.
# Calling scale_features on the test frame alone would scale it with the test set's own
# min/max, so the same raw value would map to different scaled values in train and test.
def scale_with_reference(frame, reference, feature_cols):
    """Min-max scale ``frame[feature_cols]`` using the ranges observed in ``reference``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to scale; must contain every column in ``feature_cols``.
    reference : pandas.DataFrame
        Frame whose per-column min and max define the scaling.
    feature_cols : list
        Columns to scale. All other columns of ``frame`` are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The scaled feature columns (in ``feature_cols`` order) followed by the
        untouched remaining columns of ``frame``.
    """
    ref_values = reference[feature_cols].astype(float)
    ref_min = ref_values.min()
    # A constant reference column has zero range; divide by 1 instead to avoid NaN/inf.
    ref_span = (ref_values.max() - ref_min).replace(0, 1.0)

    scaled = (frame[feature_cols].astype(float) - ref_min) / ref_span
    # Values outside the reference range would leave [0, 1]; clip them back in.
    scaled = scaled.clip(lower=0.0, upper=1.0)

    passthrough = frame.drop(columns=feature_cols)
    return pd.concat([scaled, passthrough], axis=1)


# train_df was overwritten with its scaled values earlier in the notebook, so the
# unscaled training frame is rebuilt here to recover the original min/max.
# NOTE(review): assumes scale_features is a plain min-max over every non-label
# column — confirm against utils.preprocess.
train_raw = load_nsl_kdd("../data/KDDTrain+.txt", col_names=column_names)
train_raw = binarize_labels(train_raw, label_column='label')
train_raw = encode_categorical(train_raw, categorical_cols)

test_df_scaled = scale_with_reference(test_df, train_raw, common_cols)



In [20]:
# 💾 Save the preprocessed test frame
test_out_path = '../output/nsl_kdd_preprocessed_test.csv'

# Create the output directory first so the cell also works on a fresh checkout
# (in case save_csv does not create missing directories itself).
os.makedirs(os.path.dirname(test_out_path), exist_ok=True)

save_csv(test_df_scaled, test_out_path)
print("✅ Test file saved successfully.")

✅ Test file saved successfully.
