# Week 2: Data Cleaning & Preprocessing
Objective:
- Clean the NSL-KDD dataset
- Convert categorical features into numerical format
- Check the class distribution and account for any class imbalance
- Prevent data leakage
- Prepare dataset for model training
This notebook prepares the dataset for supervised learning.

In [1]:
import pandas as pd
import numpy as np
import os
import requests

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import joblib

In [2]:
# Ensure the data directory exists before anything is read from or written to it.
os.makedirs("data", exist_ok=True)

train_path = os.path.join("data", "KDDTrain+.txt")

# Download only when no local copy exists, so re-running the notebook is cheap.
if not os.path.exists(train_path):
    print("Dataset not found. Downloading...")

    url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"
    # A timeout keeps "Run All" from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx: otherwise an HTTP error page would be saved as the
    # dataset and the exists() check above would accept it on every later run.
    response.raise_for_status()

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file at train_path.
    partial_path = train_path + ".part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    os.replace(partial_path, train_path)

    print("Download complete.")
else:
    print("Dataset already exists.")


Dataset already exists.


In [3]:
# Location of the raw training file.
DATA_DIR = "data"
train_path = os.path.join(DATA_DIR, 'KDDTrain+.txt')

# The file is read without a header row, so the column names are supplied here,
# in file order. The first 41 names are the input features.
FEATURE_COLUMNS = [
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
# The last two columns are the label and a difficulty score; neither is used
# as a model input later in the notebook.
COLUMNS = FEATURE_COLUMNS + ['class', 'difficulty_level']

df = pd.read_csv(train_path, names=COLUMNS)

print(f"Dataset Shape: {df.shape}")

Dataset Shape: (125973, 43)


In [4]:
# Count rows that are exact copies of an earlier row, then drop them
# (the first occurrence of each row is kept).
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Duplicate rows before removal: {duplicates}")

df = df.drop_duplicates()

print(f"Shape after removing duplicates: {df.shape}")

Duplicate rows before removal: 0
Shape after removing duplicates: (125973, 43)


In [5]:
# Collapse the labels into a binary target: 0 for 'normal', 1 for any other label.
# A vectorized comparison replaces the per-row Python lambda; the explicit int64
# cast keeps the column dtype independent of platform defaults.
df['binary_class'] = df['class'].ne('normal').astype('int64')

print(df['binary_class'].value_counts())

binary_class
0    67343
1    58630
Name: count, dtype: int64


In [6]:
# Separate model inputs from the target. Both label columns and the
# difficulty score are excluded from the features.
target_col = 'binary_class'
non_feature_cols = ['class', target_col, 'difficulty_level']

X = df.drop(columns=non_feature_cols)
y = df[target_col]

In [7]:
# Partition the feature columns by dtype: text-like columns are treated as
# categorical, everything else as numerical.
text_dtypes = ['object', 'string']
categorical_cols = X.select_dtypes(include=text_dtypes).columns
numerical_cols = X.select_dtypes(exclude=text_dtypes).columns
print(f"Categorical columns: {list(categorical_cols)}")
print(f"Numerical columns: {list(numerical_cols)}")

Categorical columns: ['protocol_type', 'service', 'flag']
Numerical columns: ['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [8]:
# Hold out 20% of the rows for evaluation.
split_options = {
    "test_size": 0.2,
    "random_state": 42,  # fixed seed so the split is reproducible
    "stratify": y,       # keep the class proportions of y in both partitions
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training shape: {X_train.shape}")
print(f"Testing shape: {X_test.shape}")

Training shape: (100778, 41)
Testing shape: (25195, 41)


In [9]:
# Standardize the numerical columns and one-hot encode the categorical ones.
numeric_step = ('num', StandardScaler(), numerical_cols)
# handle_unknown='ignore' lets transform() accept categories that were not
# present when the encoder was fitted, instead of raising.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit on the training split only; the test split is transformed with the
# parameters learned from the training data.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"Processed Training Shape: {X_train_processed.shape}")
print(f"Processed Testing Shape: {X_test_processed.shape}")

Processed Training Shape: (100778, 121)
Processed Testing Shape: (25195, 121)


In [10]:
# Compare the training matrix shape before and after preprocessing.
original_shape = X_train.shape
processed_shape = X_train_processed.shape
print(f"Original Training Shape: {original_shape}")
print(f"Processed Training Shape: {processed_shape}")


Original Training Shape: (100778, 41)
Processed Training Shape: (100778, 121)


In [11]:
# Persist the fitted preprocessor so the same transformation can be reloaded
# later without refitting.
models_dir = "models"
preprocessor_file = "models/preprocessor.pkl"

os.makedirs(models_dir, exist_ok=True)
joblib.dump(preprocessor, preprocessor_file)


['models/preprocessor.pkl']

In [12]:
# Share of each class in the training labels, expressed as a percentage.
class_share = y_train.value_counts(normalize=True)
class_distribution = class_share.mul(100)
print("Class distribution in training set (%):")
print(class_distribution)


Class distribution in training set (%):
binary_class
0    53.458096
1    46.541904
Name: proportion, dtype: float64


In [13]:
# Environment diagnostic: show which Python interpreter the kernel is running.
# NOTE(review): the printed path is machine-specific; consider clearing this
# cell's output before sharing the notebook.
import sys

interpreter_path = sys.executable
print(interpreter_path)


C:\Python314\python.exe


In [14]:
from imblearn.over_sampling import SMOTE

# Oversample the preprocessed training split only; the test split is left
# untouched so evaluation reflects the original class distribution.
# NOTE(review): SMOTE synthesizes rows by interpolating between neighbours, so
# the one-hot columns of synthetic rows may hold fractional values - consider
# SMOTENC on the unencoded features if that matters downstream. TODO confirm.
smote = SMOTE(random_state=42)  # fixed seed for reproducible synthetic samples

resampled = smote.fit_resample(X_train_processed, y_train)
X_train_resampled, y_train_resampled = resampled

resampled_counts = pd.Series(y_train_resampled).value_counts()

print(f"Resampled Training Shape: {X_train_resampled.shape}")
print("Resampled Class Distribution:")
print(resampled_counts)


Resampled Training Shape: (107748, 121)
Resampled Class Distribution:
binary_class
0    53874
1    53874
Name: count, dtype: int64


## Week 2 Summary
- Checked for duplicate records (none were found) so that repeated rows cannot bias training.
- Converted multi-class attack labels into binary classification.
- Performed a stratified train-test split so both splits keep the original class proportions.
- Built a preprocessing pipeline using ColumnTransformer.
- Applied scaling to numerical features and encoding to categorical features.
- Ensured no data leakage by fitting transformations only on training data.
- Applied SMOTE to the training data only, balancing the classes from roughly 53% / 47% to 50% / 50%.
The dataset is now fully prepared for supervised model training.