# 1. Import Required Libraries

In [41]:
import numpy as np
import pandas as pd
import pickle  # For saving and loading trained models
from os import path

# Importing libraries for data normalization
from sklearn import preprocessing
from sklearn.preprocessing import (
    StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler, 
    OneHotEncoder, Normalizer, MaxAbsScaler, RobustScaler, PowerTransformer
)

# Importing libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Importing machine learning libraries
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score, classification_report, precision_score, 
    recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix
)
from sklearn.model_selection import train_test_split

# Importing TensorFlow & Keras for deep learning
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from keras.layers import Dense, Input
from keras.models import Model
from keras.utils import plot_model

# Importing SHAP for explainability
import shap

# 2. Load Dataset

In [42]:
# Column names assigned to the header-less NSL-KDD files: 41 input features
# followed by the 'label' and 'difficulty' columns (43 names in total).
feature = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment",
    "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# Locations of the training and test splits (relative to the notebook).
train = "../../Datasets/NSL-KDD/KDDTrain+.txt"
test = "../../Datasets/NSL-KDD/KDDTest+.txt"

# Both files share the same layout, so they are read with the same column names.
df, df_test = (pd.read_csv(csv_path, names=feature) for csv_path in (train, test))

# Report (rows, columns) for each split.
print('Dimensions of the Training set:', df.shape)
print('Dimensions of the Test set:', df_test.shape)

Dimensions of the Training set: (125973, 43)
Dimensions of the Test set: (22544, 43)


# 3. Drop Unnecessary Columns

In [43]:
# Remove the 'difficulty' column from both splits.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=['difficulty'], errors='ignore')
df_test = df_test.drop(columns=['difficulty'], errors='ignore')

# 4. Check Label Distribution

In [44]:
# Number of training rows per raw label, most frequent first.
train_label_counts = df['label'].value_counts()
print('Label distribution in Training set:', train_label_counts, sep='\n')

Label distribution in Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64


In [45]:
# Number of test rows per raw label, most frequent first.
test_label_counts = df_test['label'].value_counts()
print('\nLabel distribution in Test set:', test_label_counts, sep='\n')


Label distribution in Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
loadmodule            2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: label, dtype: int64


# 5. Identify Categorical Features

In [46]:
def _report_object_columns(frame):
    """Print the number of distinct values of every object-dtype column in ``frame``."""
    for name in frame.columns:
        if frame[name].dtypes == 'object':
            print(f"Feature '{name}' has {frame[name].nunique()} unique categories")


print('Training set:')
_report_object_columns(df)

# The five most frequent values of the 'service' column in the training split.
print('\nDistribution of service categories:')
print(df['service'].value_counts().head())

print('\nTest set:')
_report_object_columns(df_test)


Training set:
Feature 'protocol_type' has 3 unique categories
Feature 'service' has 70 unique categories
Feature 'flag' has 11 unique categories
Feature 'label' has 23 unique categories

Distribution of service categories:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64

Test set:
Feature 'protocol_type' has 3 unique categories
Feature 'service' has 64 unique categories
Feature 'flag' has 11 unique categories
Feature 'label' has 38 unique categories


In [59]:
# Sorted distinct training values of each categorical feature.
unique_protocol = sorted(df['protocol_type'].unique())
unique_service = sorted(df['service'].unique())
unique_flag = sorted(df['flag'].unique())

# One output column name per category, in the form '<feature>_<category>'.
protocol_columns = [f'protocol_type_{value}' for value in unique_protocol]
service_columns = [f'service_{value}' for value in unique_service]
flag_columns = [f'flag_{value}' for value in unique_flag]

# Full list of one-hot column names: protocol, then service, then flag.
one_hot_columns = [*protocol_columns, *service_columns, *flag_columns]


# 6. Encode Categorical Features

In [60]:
categorical_columns = ['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

# Fit ONE encoder, on the training strings only, and reuse it for the test set.
# Previously a separate LabelEncoder was fitted on the test set, so the same
# category (e.g. a given service) received a different integer code in train
# and test, and the test one-hot columns no longer lined up with the training
# ones. handle_unknown='ignore' encodes a category that was never seen during
# fitting as an all-zero row instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
df_categorical_values_encenc = enc.fit_transform(df_categorical_values)
testdf_categorical_values_encenc = enc.transform(testdf_categorical_values)

# Column names come from the fitted encoder itself ('<feature>_<category>'),
# so they are guaranteed to match the order of the encoded columns.
encoded_column_names = [
    f'{column}_{category}'
    for column, categories in zip(categorical_columns, enc.categories_)
    for category in categories
]

# Reuse the source index so the later join with df / df_test aligns row by row.
df_cat_data = pd.DataFrame(
    df_categorical_values_encenc.toarray(),
    columns=encoded_column_names,
    index=df.index,
)
testdf_cat_data = pd.DataFrame(
    testdf_categorical_values_encenc.toarray(),
    columns=encoded_column_names,
    index=df_test.index,
)


def _integer_codes(frame):
    """Return the per-column integer codes of ``frame`` under the training vocabulary.

    Codes index into ``enc.categories_``; -1 marks a value not seen in training.
    """
    return pd.DataFrame(
        {
            column: pd.Categorical(frame[column], categories=categories).codes.astype('int64')
            for column, categories in zip(categorical_columns, enc.categories_)
        },
        index=frame.index,
    )


# Integer-coded views are kept under their original names in case later cells
# read them; both splits now share the training vocabulary.
df_categorical_values_enc = _integer_codes(df_categorical_values)
testdf_categorical_values_enc = _integer_codes(testdf_categorical_values)

# Display first five rows of the encoded training features
print(df_cat_data.head())



   protocol_type_icmp  protocol_type_tcp  protocol_type_udp  service_IRC  \
0                 0.0                1.0                0.0          0.0   
1                 0.0                0.0                1.0          0.0   
2                 0.0                1.0                0.0          0.0   
3                 0.0                1.0                0.0          0.0   
4                 0.0                1.0                0.0          0.0   

   service_X11  service_Z39_50  service_aol  service_auth  service_bgp  \
0          0.0             0.0          0.0           0.0          0.0   
1          0.0             0.0          0.0           0.0          0.0   
2          0.0             0.0          0.0           0.0          0.0   
3          0.0             0.0          0.0           0.0          0.0   
4          0.0             0.0          0.0           0.0          0.0   

   service_courier  ...  flag_REJ  flag_RSTO  flag_RSTOS0  flag_RSTR  flag_S0  \
0              0.

# Ensure consistent feature alignment

In [70]:
# Services that occur in the training split but never in the test split.
train_services = set(df['service'].unique())
test_services = set(df_test['service'].unique())
missing_services = [service for service in train_services if service not in test_services]
missing_service_cols = ['service_' + service for service in missing_services]

# Force the corresponding test columns to 0 (created if absent, overwritten if present).
for col in missing_service_cols:
    testdf_cat_data[col] = 0

# 7. Merge Encoded Features


In [73]:
# Append the one-hot columns to each split, then discard the raw string
# columns they were derived from.
encoded_source_columns = ['protocol_type', 'service', 'flag']

newdf = df.join(df_cat_data).drop(columns=encoded_source_columns)
newdf_test = df_test.join(testdf_cat_data).drop(columns=encoded_source_columns)

# Report the resulting (rows, columns) of both splits.
for title, frame in (
    ("Final Training Set Shape:", newdf),
    ("Final Test Set Shape:", newdf_test),
):
    print(title, frame.shape)

Final Training Set Shape: (125973, 123)
Final Test Set Shape: (22544, 123)


In [74]:
# Preview the first five rows of the merged training frame.
newdf_preview = newdf.head(5)
print(newdf_preview)

   duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0         0        491          0     0               0       0    0   
1         0        146          0     0               0       0    0   
2         0          0          0     0               0       0    0   
3         0        232       8153     0               0       0    0   
4         0        199        420     0               0       0    0   

   num_failed_logins  logged_in  num_compromised  ...  flag_REJ  flag_RSTO  \
0                  0          0                0  ...       0.0        0.0   
1                  0          0                0  ...       0.0        0.0   
2                  0          0                0  ...       0.0        0.0   
3                  0          1                0  ...       0.0        0.0   
4                  0          1                0  ...       0.0        0.0   

   flag_RSTOS0  flag_RSTR  flag_S0  flag_S1  flag_S2  flag_S3  flag_SF  \
0          0.0        0.

# 8. Encode Labels into Attack Categories

In [75]:
# Raw label -> integer class. 0 is 'normal'; classes 1-4 group the attack names
# (presumably DoS / Probe / R2L / U2R -- TODO confirm the intended category names).
attack_mapping = {
    'normal': 0,
    'neptune': 1, 'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1, 'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
    'ipsweep': 2, 'nmap': 2, 'portsweep': 2, 'satan': 2, 'mscan': 2, 'saint': 2,
    'ftp_write': 3, 'guess_passwd': 3, 'imap': 3, 'multihop': 3, 'phf': 3, 'spy': 3, 'warezclient': 3, 'warezmaster': 3,
    'sendmail': 3, 'named': 3, 'snmpgetattack': 3, 'snmpguess': 3, 'xlock': 3, 'xsnoop': 3, 'httptunnel': 3,
    'buffer_overflow': 4, 'loadmodule': 4, 'perl': 4, 'rootkit': 4, 'ps': 4, 'sqlattack': 4, 'xterm': 4
}


def _encode_attack_labels(labels):
    """Translate raw label names into the integer classes of ``attack_mapping``.

    Values that already are one of the integer class codes pass through
    unchanged, so re-running the cell is harmless.

    Raises:
        ValueError: if a label has no entry in ``attack_mapping``. With
            ``Series.replace`` such labels silently stayed as strings and
            left the column with mixed types.
    """
    known_codes = set(attack_mapping.values())
    unmapped = sorted(
        {
            str(value)
            for value in labels.unique()
            if value not in attack_mapping and value not in known_codes
        }
    )
    if unmapped:
        raise ValueError(f"Labels missing from attack_mapping: {unmapped}")
    # Explicit cast: do not rely on pandas implicitly downcasting object -> int.
    return labels.map(lambda value: attack_mapping.get(value, value)).astype('int64')


newdf['label'] = _encode_attack_labels(newdf['label'])
newdf_test['label'] = _encode_attack_labels(newdf_test['label'])

print(newdf['label'].head())

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64


# 9. Normalize Data using Standard Scaler


In [76]:
# StandardScaler is already imported in the notebook's import cell.

# Every numeric column except the target is standardized.
numeric_columns = list(newdf.select_dtypes(include='number').columns)
numeric_columns.remove('label')  # the class label must not be scaled

# Learn mean / standard deviation from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(newdf[numeric_columns])
for frame in (newdf, newdf_test):
    frame[numeric_columns] = scaler.transform(frame[numeric_columns])

# Show the first rows of the standardized training data
print(newdf.head())


   duration  src_bytes  dst_bytes      land  wrong_fragment    urgent  \
0 -0.110249  -0.007679  -0.004919 -0.014089       -0.089486 -0.007736   
1 -0.110249  -0.007737  -0.004919 -0.014089       -0.089486 -0.007736   
2 -0.110249  -0.007762  -0.004919 -0.014089       -0.089486 -0.007736   
3 -0.110249  -0.007723  -0.002891 -0.014089       -0.089486 -0.007736   
4 -0.110249  -0.007728  -0.004814 -0.014089       -0.089486 -0.007736   

        hot  num_failed_logins  logged_in  num_compromised  ...  flag_REJ  \
0 -0.095076          -0.027023  -0.809262        -0.011664  ... -0.312889   
1 -0.095076          -0.027023  -0.809262        -0.011664  ... -0.312889   
2 -0.095076          -0.027023  -0.809262        -0.011664  ... -0.312889   
3 -0.095076          -0.027023   1.235694        -0.011664  ... -0.312889   
4 -0.095076          -0.027023   1.235694        -0.011664  ... -0.312889   

   flag_RSTO  flag_RSTOS0  flag_RSTR   flag_S0   flag_S1   flag_S2   flag_S3  \
0   -0.11205    -0

# 10. Prepare Data for Training

In [79]:
# Work on copies so the processed frames themselves are left untouched.
multi_data, multi_data_test = newdf.copy(), newdf_test.copy()

# Features: every column except 'label'. Targets: a one-column DataFrame.
X_train_multi = multi_data.drop('label', axis=1)
y_train_multi = multi_data.loc[:, ['label']]

X_test_multi = multi_data_test.drop('label', axis=1)
y_test_multi = multi_data_test.loc[:, ['label']]

print('X_train shape:', X_train_multi.shape, 'y_train shape:', y_train_multi.shape)
print('X_test shape:', X_test_multi.shape, 'y_test shape:', y_test_multi.shape)

X_train shape: (125973, 122) y_train shape: (125973, 1)
X_test shape: (22544, 122) y_test shape: (22544, 1)


# 11. Convert Labels to One-Hot Encoding

In [82]:
# Split each processed frame into a feature matrix (all columns but 'label')
# and a target Series (the 'label' column).
X_train, y_train = newdf.drop('label', axis=1), newdf['label']
X_test, y_test = newdf_test.drop('label', axis=1), newdf_test['label']

print(f'X_train Shape: {X_train.shape}, y_train Shape: {y_train.shape}')
print(f'X_test Shape: {X_test.shape}, y_test Shape: {y_test.shape}')


X_train Shape: (125973, 122), y_train Shape: (125973,)
X_test Shape: (22544, 122), y_test Shape: (22544,)
