In [84]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [85]:
# Load the training CSV into the working frame that every later cell transforms.
df = pd.read_csv('../data/kdd_train.csv')

Remove unused features:
- num_outbound_cmds   (all zeros)
- difficulty_level    (annotation)
- attack_type         (label)
- dst_host_count      (right-saturated + large scale -> bad for log)
- dst_host_srv_count  (right-saturated + large scale -> bad for log)

In [86]:
# Set the label aside first: it is removed from the feature frame below and
# re-attached at the end of the notebook.
attack_col = df['attack_type']

# Columns that are constant, annotations, the label itself, or unsuitable for
# the log transform (see the markdown list above).
unused_columns = [
    'num_outbound_cmds',
    'difficulty_level',
    'attack_type',
    'dst_host_count',
    'dst_host_srv_count',
]
df = df.drop(columns=unused_columns)

Handle highly correlated features:  
  Highly correlated feature pairs (|r| > 0.8):  
  num_compromised                     <-> num_root                           :  0.9988  
  serror_rate                         <-> srv_serror_rate                    :  0.9933  
  rerror_rate                         <-> srv_rerror_rate                    :  0.9890  
  srv_serror_rate                     <-> dst_host_srv_serror_rate           :  0.9863  
  dst_host_serror_rate                <-> dst_host_srv_serror_rate           :  0.9851  
  serror_rate                         <-> dst_host_srv_serror_rate           :  0.9811  
  serror_rate                         <-> dst_host_serror_rate               :  0.9794  
  srv_serror_rate                     <-> dst_host_serror_rate               :  0.9776  
  srv_rerror_rate                     <-> dst_host_srv_rerror_rate           :  0.9702  
  rerror_rate                         <-> dst_host_srv_rerror_rate           :  0.9644  
  rerror_rate                         <-> dst_host_rerror_rate               :  0.9267  
  dst_host_rerror_rate                <-> dst_host_srv_rerror_rate           :  0.9247  
  srv_rerror_rate                     <-> dst_host_rerror_rate               :  0.9178  
  dst_host_srv_count                  <-> dst_host_same_srv_rate             :  0.8967  
  hot                                 <-> is_guest_login                     :  0.8603  

In [87]:
# One representative is kept per highly correlated group (|r| > 0.8); the
# remaining members of each group are dropped.
#
# Group 1 - num_compromised <-> num_root (0.9988)
#   kept:    num_root (more specific - root shell access)
#   dropped: num_compromised
#
# Group 2 - SYN-error rates (four mutually correlated features)
#   kept:    srv_serror_rate (service-level, important for DoS detection)
#   dropped: serror_rate, dst_host_serror_rate, dst_host_srv_serror_rate
#
# Group 3 - REJ-error rates (four mutually correlated features)
#   kept:    srv_rerror_rate (service-level, important for DoS detection)
#   dropped: rerror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate
#
# Group 4 - dst_host_srv_count <-> dst_host_same_srv_rate (0.8967)
#   dst_host_srv_count was already removed in the previous cell, so nothing
#   is dropped here.
#
# Group 5 - hot <-> is_guest_login (0.8603)
#   kept:    hot (direct attack indicator - system file access)
#   dropped: is_guest_login

correlated_columns = [
    # redundant with num_root
    'num_compromised',
    # redundant with srv_serror_rate
    'serror_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    # redundant with srv_rerror_rate
    'rerror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # redundant with hot
    'is_guest_login',
]
df = df.drop(columns=correlated_columns)

Log transform features with large ranges:
- src_bytes
- dst_bytes
- duration
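- num_compromised (already dropped in the previous step because of its correlation with num_root, so it is not transformed)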
- num_root
- srv_count
- count

In [88]:
# Compress the wide-ranged features with log1p, i.e. log(1 + x), which is
# defined at zero.
# num_compromised is absent from the list because it was dropped in the
# previous step (correlated with num_root).
log_features = ['src_bytes', 'dst_bytes', 'duration', 'num_root', 'srv_count', 'count']

# Apply the transform to all selected columns in one vectorized call.
df[log_features] = np.log1p(df[log_features])

Normalize using StandardScaler
- All numeric features except "_rate" (already scaled) and binary features

In [89]:
# Standardize every numeric column except the 0/1 flags and the *_rate
# columns (already on a 0-1 scale). The categorical columns are non-numeric,
# so select_dtypes leaves them out without an explicit exclusion.

# 0/1 indicator columns
binary_features = ['land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login']

# Rate columns, already in [0, 1]
rate_features = [
    'srv_serror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
]

# String-valued columns, encoded in later cells
categorical_features = ['protocol_type', 'service', 'flag']

# All numeric columns currently in the frame, in column order
numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

# numeric - binary - rate, preserving the frame's column order
skip_scaling = set(binary_features) | set(rate_features)
features_to_normalize = [name for name in numeric_features if name not in skip_scaling]

# Fit the scaler on the selected columns, then overwrite them with z-scores
scaler = StandardScaler()
scaler.fit(df[features_to_normalize])
df[features_to_normalize] = scaler.transform(df[features_to_normalize])

print(f"Normalized {len(features_to_normalize)} features: {features_to_normalize}")

Normalized 13 features: ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'count', 'srv_count']


In [90]:
# Preview the first 10 rows after the log transform and standardization.
df.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,srv_count,srv_serror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate
0,-0.221617,tcp,ftp_data,SF,0.995517,-0.869111,0,-0.089486,-0.007736,-0.095076,...,-0.854523,0.0,0.0,1.0,0.0,0.0,0.17,0.03,0.17,0.0
1,-0.221617,udp,other,SF,0.590429,-0.869111,0,-0.089486,-0.007736,-0.095076,...,-1.166217,0.0,0.0,0.08,0.15,0.0,0.0,0.6,0.88,0.0
2,-0.221617,tcp,private,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,-0.203178,1.0,0.0,0.05,0.07,0.0,0.1,0.05,0.0,0.0
3,-0.221617,tcp,http,SF,0.744881,1.668642,0,-0.089486,-0.007736,-0.095076,...,-0.321678,0.2,0.0,1.0,0.0,0.0,1.0,0.0,0.03,0.04
4,-0.221617,tcp,http,SF,0.69367,0.83356,0,-0.089486,-0.007736,-0.095076,...,0.988816,0.0,0.0,1.0,0.0,0.09,1.0,0.0,0.0,0.0
5,-0.221617,tcp,private,REJ,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,0.603854,0.0,1.0,0.16,0.06,0.0,0.07,0.07,0.0,0.0
6,-0.221617,tcp,private,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,0.07101,1.0,0.0,0.05,0.06,0.0,0.04,0.05,0.0,0.0
7,-0.221617,tcp,private,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,0.47892,1.0,0.0,0.14,0.06,0.0,0.06,0.07,0.0,0.0
8,-0.221617,tcp,remote_job,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,0.74401,1.0,0.0,0.09,0.05,0.0,0.09,0.05,0.0,0.0
9,-0.221617,tcp,private,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,...,-0.009984,1.0,0.0,0.06,0.06,0.0,0.05,0.06,0.0,0.0


Encode categorical features (low cardinality, one-hot):
- protocol_type

In [91]:
# protocol_type has three values (tcp, udp, icmp), so it is one-hot encoded.
# drop_first=True keeps only two indicator columns: the omitted category is
# implied when both are 0, which avoids the dummy variable trap.
df = pd.get_dummies(
    df,
    columns=['protocol_type'],
    prefix='protocol',
    drop_first=True,
    dtype=int,
)

protocol_columns = [name for name in df.columns if name.startswith('protocol_')]
print(f"New columns: {protocol_columns}")

New columns: ['protocol_tcp', 'protocol_udp']


In [92]:
# Preview the frame now that protocol_type is replaced by the protocol_* indicator columns.
df.head()

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,protocol_tcp,protocol_udp
0,-0.221617,ftp_data,SF,0.995517,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.0,0.17,0.03,0.17,0.0,1,0
1,-0.221617,other,SF,0.590429,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.08,0.15,0.0,0.0,0.6,0.88,0.0,0,1
2,-0.221617,private,S0,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.05,0.07,0.0,0.1,0.05,0.0,0.0,1,0
3,-0.221617,http,SF,0.744881,1.668642,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.0,1.0,0.0,0.03,0.04,1,0
4,-0.221617,http,SF,0.69367,0.83356,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.09,1.0,0.0,0.0,0.0,1,0


Encode categorical features (high cardinality, frequency-encode):
- service
- flag

In [93]:
# Frequency-encode the high-cardinality categoricals: each category is
# replaced by the number of rows in which it occurs, and the resulting counts
# are then standardized.

# category -> occurrence count lookups (service: 70 categories, flag: 11)
service_freq = df['service'].value_counts().to_dict()
flag_freq = df['flag'].value_counts().to_dict()

# Swap each category label for its count
df['service'] = df['service'].map(service_freq)
df['flag'] = df['flag'].map(flag_freq)

# Standardize the two count columns together
freq_encoded_columns = ['service', 'flag']
freq_scaler = StandardScaler()
df[freq_encoded_columns] = freq_scaler.fit_transform(df[freq_encoded_columns])

print(f"Service sample values: {df['service'].head().tolist()}")
print(f"Flag sample values: {df['flag'].head().tolist()}")

Service sample values: [-0.7282544225205467, -0.8815650420624268, 0.19081239826770158, 1.3239378689720815, 1.3239378689720815]
Flag sample values: [0.7820636339392766, 0.7820636339392766, -0.813914405734298, 0.7820636339392766, 0.7820636339392766]


In [94]:
# Preview the first 10 rows now that service and flag hold standardized frequency codes.
df.head(10)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,protocol_tcp,protocol_udp
0,-0.221617,-0.728254,0.782064,0.995517,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.0,0.17,0.03,0.17,0.0,1,0
1,-0.221617,-0.881565,0.782064,0.590429,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.08,0.15,0.0,0.0,0.6,0.88,0.0,0,1
2,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.05,0.07,0.0,0.1,0.05,0.0,0.0,1,0
3,-0.221617,1.323938,0.782064,0.744881,1.668642,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.0,1.0,0.0,0.03,0.04,1,0
4,-0.221617,1.323938,0.782064,0.69367,0.83356,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,1.0,0.0,0.09,1.0,0.0,0.0,0.0,1,0
5,-0.221617,0.190812,-1.75405,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,1.0,0.16,0.06,0.0,0.07,0.07,0.0,0.0,1,0
6,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.05,0.06,0.0,0.04,0.05,0.0,0.0,1,0
7,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.14,0.06,0.0,0.06,0.07,0.0,0.0,1,0
8,-0.221617,-1.143989,-0.813914,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.09,0.05,0.0,0.09,0.05,0.0,0.0,1,0
9,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,0,-0.089486,-0.007736,-0.095076,-0.027023,...,0.0,0.06,0.06,0.0,0.05,0.06,0.0,0.0,1,0


In [95]:
# Per-column variance of the preprocessed frame (31 features), largest first.
feature_variances = df.var().sort_values(ascending=False)

print(f"Total features: {len(feature_variances)}")
print(f"\nVariance for each feature (sorted by variance):\n")
# One aligned "name : variance" row per feature
for column_name, column_variance in feature_variances.items():
    print(f"{column_name:35s} : {column_variance:.6f}")

Total features: 31

Variance for each feature (sorted by variance):

num_shells                          : 1.000008
urgent                              : 1.000008
num_file_creations                  : 1.000008
duration                            : 1.000008
dst_bytes                           : 1.000008
src_bytes                           : 1.000008
num_failed_logins                   : 1.000008
num_root                            : 1.000008
flag                                : 1.000008
service                             : 1.000008
count                               : 1.000008
srv_count                           : 1.000008
hot                                 : 1.000008
num_access_files                    : 1.000008
wrong_fragment                      : 1.000008
logged_in                           : 0.239131
dst_host_same_srv_rate              : 0.201556
srv_serror_rate                     : 0.199829
same_srv_rate                       : 0.193268
protocol_tcp                        : 

Drop lowest variance features (they won't contribute much to the model)

In [96]:
# Remove the lowest-variance columns identified in the variance report.
low_variance_columns = ['su_attempted', 'root_shell', 'land', 'is_host_login']
df = df.drop(columns=low_variance_columns)

In [97]:
# Number of columns remaining after the low-variance drop.
len(df.columns)

27

In [98]:
# (rows, columns) of the preprocessed feature frame.
df.shape

(125973, 27)

In [99]:
# Put the label saved at the start back as the last column, for readability.
# Assignment aligns on the row index, which the earlier steps left unchanged.
df = df.assign(attack_type=attack_col)

In [100]:
# Show 10 random rows as a sanity check. random_state pins the draw so the
# same rows are displayed on every Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,protocol_tcp,protocol_udp,attack_type
71409,-0.221617,1.323938,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.33,0.33,0.0,0.23,0.04,0.0,0.0,1,0,neptune
42159,-0.221617,-1.12658,-1.75405,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.01,0.08,0.0,0.0,0.08,0.0,0.0,1,0,neptune
66829,-0.221617,-0.86765,0.782064,-0.346205,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.51,0,0,ipsweep
92852,-0.221617,-1.115362,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.08,0.06,0.0,0.07,0.08,0.0,0.0,1,0,neptune
92726,-0.221617,-0.881565,0.782064,0.592702,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.2,0.6,0.0,0.0,0.1,0.15,0.0,0,1,normal
88233,-0.221617,-0.594437,0.782064,0.200851,0.20971,-0.089486,-0.007736,-0.095076,-0.027023,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1,normal
3659,-0.221617,1.323938,0.782064,0.878437,1.124685,-0.089486,-0.007736,-0.095076,-0.027023,1,...,1.0,0.0,0.0,1.0,0.0,0.02,0.04,1,0,normal
2482,-0.221617,0.190812,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.03,0.06,0.0,0.06,0.06,0.0,0.0,1,0,neptune
62395,0.88729,-0.728254,0.782064,1.172084,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,1,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1,0,warezclient
8144,-0.221617,-1.004532,-0.813914,-1.08299,-0.869111,-0.089486,-0.007736,-0.095076,-0.027023,0,...,0.06,0.06,0.0,0.06,0.07,0.0,0.0,1,0,neptune


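Much better! We went from 43 columns to 27 feature columns via preprocessing (28 columns in total once the attack_type label is re-appended). Now save the preprocessed CSV, which includes the attack_type column.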

In [101]:
# Write the preprocessed frame to disk; index=False leaves the row index out of the file.
df.to_csv(
    '../data/preproc_kdd_train.csv',
    index=False,
)