In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold
# from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE, MutualInfoClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
# ================================
# NSL-KDD Dataset Loading Script
# ================================

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# --------------------
# Suppress warnings
# --------------------
warnings.filterwarnings('ignore')

# --------------------
# Plot styling
# --------------------
plt.style.use('ggplot')
sns.set_palette("husl")

# --------------------
# Column Names (NSL-KDD)
# --------------------
COL_NAMES = [
    "duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate",
    "label", "difficulty_level"
]

# --------------------
# File Paths (Windows Safe)
# --------------------
train_path = r"C:\Users\shrir\Desktop\SentinelNet-AI-Powered-Network-Intrusion-Detection-System-\data\KDDTrain+.txt"
test_path  = r"C:\Users\shrir\Desktop\SentinelNet-AI-Powered-Network-Intrusion-Detection-System-\data\KDDTest+.txt"

# --------------------
# Load Data with Error Handling
# --------------------
try:
    df_train = pd.read_csv(train_path, header=None, names=COL_NAMES)
    df_test  = pd.read_csv(test_path, header=None, names=COL_NAMES)

    print("‚úÖ Files Loaded Successfully!")
    print("Training Data Shape:", df_train.shape)
    print("Test Data Shape:", df_test.shape)

except FileNotFoundError:
    print("‚ùå Files not found. Trying local directory...")

    if os.path.exists("KDDTrain+.txt") and os.path.exists("KDDTest+.txt"):
        df_train = pd.read_csv("KDDTrain+.txt", header=None, names=COL_NAMES)
        df_test  = pd.read_csv("KDDTest+.txt", header=None, names=COL_NAMES)

        print("‚úÖ Loaded from local directory!")
        print("Training Data Shape:", df_train.shape)
        print("Test Data Shape:", df_test.shape)
    else:
        raise FileNotFoundError("Dataset files not found. Please check the path.")

# --------------------
# Combine Train & Test (Optional)
# --------------------
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print("\nüìä Combined Dataset Shape:", df_combined.shape)
print("\nFirst 5 rows:")
print(df_combined.head())

# --------------------
# Basic Info
# --------------------
print("\nDataset Info:")
print(df_combined.info())

print("\nLabel Distribution:")
print(df_combined['label'].value_counts())

‚úÖ Files Loaded Successfully!
Training Data Shape: (125973, 43)
Test Data Shape: (22544, 43)

üìä Combined Dataset Shape: (148517, 43)

First 5 rows:
   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0       0    0  ...                    0.17   
1               0       0    0  ...                    0.00   
2               0       0    0  ...                    0.10   
3               0       0    0  ...                    1.00   
4               0       0    0  ...                    1.00   

   dst_host_diff_srv_rate  dst_host_

In [3]:
exclude_cols = ['label', 'attack_class', 'difficulty_level', 'is_outlier']

In [10]:
# ==================================
# SentinelNet ML Preprocessing Pipeline
# ==================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1Ô∏è‚É£ Combine Train + Test
# -----------------------------
df = pd.concat([df_train, df_test], ignore_index=True)

# -----------------------------
# 2Ô∏è‚É£ Create Attack Class Mapping (if not created)
# -----------------------------
attack_mapping = {
    'normal': 'normal',

    # DoS
    'back': 'DoS', 'land': 'DoS', 'neptune': 'DoS', 'pod': 'DoS',
    'smurf': 'DoS', 'teardrop': 'DoS', 'mailbomb': 'DoS',
    'apache2': 'DoS', 'processtable': 'DoS', 'udpstorm': 'DoS',

    # Probe
    'ipsweep': 'Probe', 'nmap': 'Probe', 'portsweep': 'Probe',
    'satan': 'Probe', 'mscan': 'Probe', 'saint': 'Probe',

    # R2L
    'ftp_write': 'R2L', 'guess_passwd': 'R2L', 'imap': 'R2L',
    'multihop': 'R2L', 'phf': 'R2L', 'spy': 'R2L',
    'warezclient': 'R2L', 'warezmaster': 'R2L',
    'sendmail': 'R2L', 'named': 'R2L', 'snmpgetattack': 'R2L',
    'snmpguess': 'R2L', 'xlock': 'R2L', 'xsnoop': 'R2L',
    'worm': 'R2L',

    # U2R
    'buffer_overflow': 'U2R', 'loadmodule': 'U2R',
    'perl': 'U2R', 'rootkit': 'U2R',
    'httptunnel': 'U2R', 'ps': 'U2R',
    'sqlattack': 'U2R', 'xterm': 'U2R'
}

df['attack_class'] = df['label'].map(attack_mapping)

# -----------------------------
# 3Ô∏è‚É£ Encode Categorical Features
# -----------------------------
categorical_cols = ['protocol_type', 'service', 'flag']

df = pd.get_dummies(df, columns=categorical_cols)

# -----------------------------
# 4Ô∏è‚É£ Encode Target
# -----------------------------
le = LabelEncoder()
df['attack_class'] = le.fit_transform(df['attack_class'])

# -----------------------------
# 5Ô∏è‚É£ Drop Unnecessary Columns
# -----------------------------
exclude_cols = ['label', 'difficulty_level']
X = df.drop(columns=exclude_cols)
y = df['attack_class']

# -----------------------------
# 6Ô∏è‚É£ Handle Missing Values
# -----------------------------
X = X.fillna(0)

# -----------------------------
# 7Ô∏è‚É£ Scale Features
# -----------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# 8Ô∏è‚É£ Apply SMOTE (Balance Classes)
# -----------------------------
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Before SMOTE:\n", y.value_counts())
print("\nAfter SMOTE:\n", pd.Series(y_resampled).value_counts())

# -----------------------------
# 9Ô∏è‚É£ Train-Test Split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled,
    test_size=0.2,
    random_state=42,
    stratify=y_resampled
)

print("\nFinal Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)

Before SMOTE:
 attack_class
4    77054
0    53385
1    14077
2     3749
3      252
Name: count, dtype: int64

After SMOTE:
 attack_class
4    77054
0    77054
2    77054
1    77054
3    77054
Name: count, dtype: int64

Final Shapes:
X_train: (308216, 123)
X_test: (77054, 123)
