In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

## Process training dataset

In [21]:
# Read the training CSV; the training-side cells below all operate on `df`.
df = pd.read_csv(filepath_or_buffer='../data/kdd_train.csv')

Remove unused features:
- num_outbound_cmds   (all zeros)
- difficulty_level    (annotation)
- attack_type         (label)
- dst_host_count      (right-saturated + large scale -> bad for log)
- dst_host_srv_count  (right-saturated + large scale -> bad for log)

In [22]:
# Keep a handle on the label column before it is removed from the feature frame;
# it is re-attached once the preprocessing is finished.
attack_col = df['attack_type']

# Columns that are not used as model inputs (see the list in the text above).
unused_columns = ['num_outbound_cmds', 'difficulty_level', 'attack_type',
                  'dst_host_count', 'dst_host_srv_count']
df = df.drop(columns=unused_columns)

Log transform features with large ranges:
- src_bytes
- dst_bytes
- duration
- num_compromised
- num_root
- srv_count
- count

In [None]:
# Compress the wide-range features with log1p, i.e. log(1 + x).
# log1p maps 0 to 0, so zero-valued entries remain well defined.

log_features = ['src_bytes', 'dst_bytes', 'duration', 'num_compromised', 'num_root', 'srv_count', 'count']

# Apply the transform to the whole column subset in one vectorized call.
df[log_features] = np.log1p(df[log_features])

Encode categorical features (low cardinality, one-hot):
- protocol_type

In [26]:
# protocol_type has 3 unique values (tcp, udp, icmp), so it is one-hot encoded.
# drop_first=True drops one indicator to avoid multicollinearity: the dropped
# category is implied whenever the two remaining indicators are both 0.

df = pd.get_dummies(df, columns=['protocol_type'], prefix='protocol', drop_first=True, dtype=int)

# Report which indicator columns were created.
protocol_columns = [name for name in df.columns if name.startswith('protocol_')]
print(f"New columns: {protocol_columns}")

New columns: ['protocol_tcp', 'protocol_udp']


In [27]:
# Preview the first five rows after the one-hot encoding step.
df.head(n=5)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_tcp,protocol_udp
0,0.0,ftp_data,SF,6.198479,0.0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,1,0
1,0.0,other,SF,4.990433,0.0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,1
2,0.0,private,S0,0.0,0.0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,0
3,0.0,http,SF,5.451038,9.006264,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,1,0
4,0.0,http,SF,5.298317,6.042633,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


Encode categorical features (high cardinality, frequency-encode):
- service
- flag

In [28]:
# Frequency encoding: each category is replaced by the number of rows in which
# it occurs in this frame. The lookup dicts stay in scope so the test data can
# later be encoded with the same (training) counts.

service_freq = df['service'].value_counts().to_dict()  # 'service': 70 unique values
flag_freq = df['flag'].value_counts().to_dict()        # 'flag': 11 unique values

df = df.assign(
    service=df['service'].map(service_freq),
    flag=df['flag'].map(flag_freq),
)

# Show a few encoded values as a quick sanity check.
service_preview = df['service'].head().tolist()
flag_preview = df['flag'].head().tolist()
print(f"Service sample values: {service_preview}")
print(f"Flag sample values: {flag_preview}")

Service sample values: [6860, 4359, 21853, 40338, 40338]
Flag sample values: [74945, 74945, 34851, 74945, 74945]


Normalize numeric features using StandardScaler.
Exclude one-hot encoded columns (protocol_tcp, protocol_udp) - they should stay as binary 0/1.

In [None]:
# Standardize every numeric column except the one-hot indicators,
# which are left as binary 0/1 values.

# Indicator columns that must not be scaled
onehot_cols = ['protocol_tcp', 'protocol_udp']

# Every numeric column currently in the frame, minus the indicators
all_numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()
features_to_scale = [name for name in all_numeric_features if name not in onehot_cols]

# Fit on the training data; the fitted scaler is reused later for the test data.
scaler = StandardScaler()
scaler.fit(df[features_to_scale])
df[features_to_scale] = scaler.transform(df[features_to_scale])

print(f"Normalized {len(features_to_scale)} features (excluded {len(onehot_cols)} one-hot columns)")
print(f"Scaled features: {features_to_scale}")
print(f"One-hot (not scaled): {onehot_cols}")

In [30]:
# Preview the first ten rows after scaling.
df.head(n=10)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_tcp,protocol_udp
0,-0.221617,-0.728254,0.782064,0.995517,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,0.476175,-0.367555
1,-0.221617,-0.881565,0.782064,0.590429,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-1.16103,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,-2.100067,2.720684
2,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555
3,-0.221617,1.323938,0.782064,0.744881,1.668642,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,0.476175,-0.367555
4,-0.221617,1.323938,0.782064,0.69367,0.83356,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,0.476175,-0.367555
5,-0.221617,0.190812,-1.75405,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-1.00511,-0.068553,-0.480197,-0.289103,-0.639532,-0.624871,2.87441,2.753914,0.476175,-0.367555
6,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-1.071933,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555
7,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-1.027384,-0.068553,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555
8,-0.221617,-1.143989,-0.813914,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-0.960561,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555
9,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,...,-1.049659,-0.121485,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555


In [31]:
# Per-column variance of the frame after normalization, largest first.
# The number of columns is printed below rather than assumed here.
feature_variances = df.var().sort_values(ascending=False)

print(f"Total features: {len(feature_variances)}")
print("\nVariance for each feature (sorted by variance):\n")
for column_name, column_variance in feature_variances.items():
    print(f"{column_name:35s} : {column_variance:.6f}")

Total features: 39

Variance for each feature (sorted by variance):

num_shells                          : 1.000008
duration                            : 1.000008
num_file_creations                  : 1.000008
protocol_tcp                        : 1.000008
dst_host_srv_rerror_rate            : 1.000008
dst_host_srv_serror_rate            : 1.000008
dst_host_srv_diff_host_rate         : 1.000008
dst_host_same_src_port_rate         : 1.000008
srv_diff_host_rate                  : 1.000008
same_srv_rate                       : 1.000008
srv_serror_rate                     : 1.000008
serror_rate                         : 1.000008
is_guest_login                      : 1.000008
urgent                              : 1.000008
su_attempted                        : 1.000008
rerror_rate                         : 1.000008
flag                                : 1.000008
src_bytes                           : 1.000008
dst_bytes                           : 1.000008
land                                : 

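Drop the lowest-variance features (near-constant columns contribute little to the model). Note that variance has to be judged before standardization: StandardScaler rescales every non-constant feature to unit variance, so the post-scaling variances above cannot rank features.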

In [32]:
# Remove the columns selected as low-variance; the same list is dropped from the test data.
low_variance_columns = ['su_attempted', 'root_shell', 'land', 'is_host_login']
df = df.drop(columns=low_variance_columns)

In [33]:
# Number of feature columns remaining after the drop.
df.shape[1]

35

In [34]:
# (rows, columns) of the processed training frame.
(len(df.index), len(df.columns))

(125973, 35)

In [35]:
# Put the label back as the last column so each saved row carries its attack type.
df = df.assign(attack_type=attack_col)

In [36]:
# Show ten randomly chosen rows (unseeded, so the selection differs between runs).
df.sample(n=10)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_tcp,protocol_udp,attack_type
99484,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,-0.068553,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555,neptune
52136,-0.221617,-1.108681,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,-0.121485,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555,neptune
13996,-0.221617,-0.728254,0.782064,0.866635,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,...,-0.439078,2.756092,1.665351,-0.639532,-0.624871,-0.387635,-0.376387,0.476175,-0.367555,warezclient
59792,-0.221617,1.323938,0.782064,0.830749,0.691179,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,...,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,0.476175,-0.367555,normal
16877,-0.221617,1.323938,0.782064,0.808885,1.166654,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,...,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,0.476175,-0.367555,normal
121803,-0.221617,-0.728254,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555,neptune
33294,-0.221617,-0.728254,0.782064,1.066504,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,...,-0.015621,0.94377,-0.289103,-0.617049,-0.579994,-0.387635,-0.376387,0.476175,-0.367555,normal
12000,-0.221617,-1.113891,-1.75405,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,-0.015621,-0.480197,-0.289103,-0.639532,-0.624871,2.87441,2.753914,0.476175,-0.367555,neptune
47825,-0.221617,-1.106535,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,-0.068553,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,0.476175,-0.367555,neptune
53396,-0.221617,0.190812,-1.75405,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,...,1.254751,0.458327,-0.289103,-0.639532,-0.624871,0.590979,2.753914,0.476175,-0.367555,portsweep


Preprocessing complete! Now save the preprocessed training data as a CSV file.

In [37]:
# Write the processed training frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='../data/preproc_kdd_train.csv', index=False)

## Process test dataset
Apply the same transformations to the test dataset, reusing the frequency mappings and the scaler fitted on the training data.

In [38]:
# Load the test CSV and repeat the first training steps on it:
# set the label aside, drop the unused columns, log-transform the wide-range features.
df_test = pd.read_csv('../data/kdd_test.csv')

# Label column, re-attached at the end of the test preprocessing
attack_col_test = df_test['attack_type']

# Same columns as removed from the training frame
unused_test_columns = ['num_outbound_cmds', 'difficulty_level', 'attack_type',
                       'dst_host_count', 'dst_host_srv_count']
df_test = df_test.drop(columns=unused_test_columns)

# Same log1p features as training, transformed in one vectorized call
log_features = ['src_bytes', 'dst_bytes', 'duration', 'num_compromised', 'num_root', 'srv_count', 'count']
df_test[log_features] = np.log1p(df_test[log_features])

print(f'Test dataset shape after feature engineering: {df_test.shape}')

Test dataset shape after feature engineering: (22544, 38)


In [39]:
# One-hot encode protocol_type so that the result has exactly the indicator
# columns produced for the training data (`onehot_cols`).
#
# Calling get_dummies(..., drop_first=True) on the test frame alone would derive
# the categories - and therefore the dropped baseline - from whatever protocols
# happen to occur in the test data. If the test data lacked a protocol, or
# contained an extra one, the indicator columns would silently differ from training.
# Instead: encode every category present, then reindex to the training layout.
df_test = pd.get_dummies(df_test, columns=['protocol_type'], prefix='protocol', dtype=int)

non_protocol_cols = [col for col in df_test.columns if not col.startswith('protocol_')]
# reindex drops indicators that training did not keep (the baseline, or a protocol
# unseen in training, which is then encoded as all zeros like the baseline) and
# creates any missing training indicator filled with 0.
df_test = df_test.reindex(columns=non_protocol_cols + list(onehot_cols), fill_value=0)

# Frequency encode using the TRAINING frequency mappings.
# Categories that never occurred in training have no entry in the mapping,
# so map() yields NaN for them; they are given a frequency of 0.
df_test['service'] = df_test['service'].map(service_freq).fillna(0)
df_test['flag'] = df_test['flag'].map(flag_freq).fillna(0)

print(f'Test dataset shape after encoding: {df_test.shape}')

Test dataset shape after encoding: (22544, 39)


In [None]:
# Scale the test data with the scaler that was fitted on the training data:
# transform only, never re-fit. `features_to_scale` is the training column list,
# which leaves out the one-hot indicator columns.

scaled_test_values = scaler.transform(df_test[features_to_scale])
df_test[features_to_scale] = scaled_test_values

print(f'Normalized {len(features_to_scale)} features in test dataset (one-hot columns remain binary)')

In [41]:
# Remove the same low-variance columns that were dropped from the training frame,
# then re-attach the label as the last column.
low_variance_test_columns = ['su_attempted', 'root_shell', 'land', 'is_host_login']
df_test = df_test.drop(columns=low_variance_test_columns).assign(attack_type=attack_col_test)

# Report the final layout of the test frame.
final_test_shape = df_test.shape
final_test_columns = df_test.columns.tolist()
print(f'Final test dataset shape: {final_test_shape}')
print(f'Test columns: {final_test_columns}')

Final test dataset shape: (22544, 36)
Test columns: ['duration', 'service', 'flag', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_tcp', 'protocol_udp', 'attack_type']


In [42]:
# Write the processed test frame (row index omitted) and confirm where it went.
test_output_path = '../data/preproc_kdd_test.csv'
df_test.to_csv(test_output_path, index=False)
print(f'Saved preprocessed test dataset to {test_output_path}')

Saved preprocessed test dataset to ../data/preproc_kdd_test.csv
