In [1]:
import pandas as pd
import numpy as np

## Process training dataset


In [2]:
# Read the raw training split from the data directory (path is relative to this notebook)
df = pd.read_csv(filepath_or_buffer='../data/kdd_train.csv')

Remove unused features:

- num_outbound_cmds (all zeros)
- difficulty_level (annotation)
- attack_type (label)
- dst_host_count (right-saturated + large scale -> bad for log)
- dst_host_srv_count (right-saturated + large scale -> bad for log)


In [3]:
# Keep a handle on the label column before it is removed from the frame;
# it is re-attached after all feature transformations are done.
attack_col = df['attack_type']

# Rebind df to a copy without the unused columns (same five names as listed above)
df = df.drop(
  [
    'num_outbound_cmds',
    'difficulty_level',
    'attack_type',
    'dst_host_count',
    'dst_host_srv_count',
  ],
  axis='columns',
)

Log transform features with large ranges:

- src_bytes
- dst_bytes
- duration
- num_compromised
- num_root
- srv_count
- count


In [4]:
# Features compressed with log1p (log(1 + x)), which maps 0 to 0 and so is
# safe for the zero values these columns contain. The list is reused later
# when the test split is processed.
log_features = [
  'src_bytes',
  'dst_bytes',
  'duration',
  'num_compromised',
  'num_root',
  'srv_count',
  'count',
]

# Overwrite each listed column with its log1p-transformed values; assign keeps
# the columns at their existing positions.
df = df.assign(**{name: np.log1p(df[name]) for name in log_features})

Encode categorical features (low cardinality, one-hot):

- protocol_type


In [5]:
# Expand protocol_type into 0/1 indicator columns prefixed with "protocol_".
# drop_first=True omits the first category's column so the indicators are not
# perfectly collinear.
df = pd.get_dummies(
  data=df,
  columns=['protocol_type'],
  prefix='protocol',
  drop_first=True,
  dtype=int,
)
print(f'One-hot columns: {df.columns[df.columns.str.startswith("protocol_")].tolist()}')

One-hot columns: ['protocol_tcp', 'protocol_udp']


In [6]:
# Preview the first five rows after one-hot encoding
df.head(n=5)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_tcp,protocol_udp
0,0.0,ftp_data,SF,6.198479,0.0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,1,0
1,0.0,other,SF,4.990433,0.0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,1
2,0.0,private,S0,0.0,0.0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,0
3,0.0,http,SF,5.451038,9.006264,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,1,0
4,0.0,http,SF,5.298317,6.042633,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0


Encode categorical features (high cardinality, frequency-encode):

- service
- flag


In [7]:
# Count-encode the high cardinality categoricals (service: 70 values, flag: 11 values):
# every category is replaced by the number of training rows it appears in.
# The two lookup dicts stay in scope so the test split can be encoded with the
# same training-derived counts.
service_freq, flag_freq = (
  df[column].value_counts().to_dict() for column in ('service', 'flag')
)

df = df.assign(
  service=df['service'].map(service_freq),
  flag=df['flag'].map(flag_freq),
)

print(f'Frequency encoded - Service: {df["service"].head().tolist()}')
print(f'Frequency encoded - Flag: {df["flag"].head().tolist()}')

Frequency encoded - Service: [6860, 4359, 21853, 40338, 40338]
Frequency encoded - Flag: [74945, 74945, 34851, 74945, 74945]


In [8]:
# Preview the first ten rows: service and flag now hold counts instead of strings
df.head(n=10)

Unnamed: 0,duration,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_tcp,protocol_udp
0,0.0,6860,74945,6.198479,0.0,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,1,0
1,0.0,4359,74945,4.990433,0.0,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,1
2,0.0,21853,34851,0.0,0.0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,0
3,0.0,40338,74945,5.451038,9.006264,0,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,1,0
4,0.0,40338,74945,5.298317,6.042633,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0
5,0.0,21853,11233,0.0,0.0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1,0
6,0.0,21853,34851,0.0,0.0,0,0,0,0,0,...,0.04,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,0
7,0.0,21853,34851,0.0,0.0,0,0,0,0,0,...,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0,1,0
8,0.0,78,34851,0.0,0.0,0,0,0,0,0,...,0.09,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,0
9,0.0,21853,34851,0.0,0.0,0,0,0,0,0,...,0.05,0.06,0.0,0.0,1.0,1.0,0.0,0.0,1,0


In [9]:
# Attach the saved label column again (it becomes the last column) for reference
df = df.assign(attack_type=attack_col)
print(f'Final training shape: {df.shape}')

Final training shape: (125973, 40)


Save preprocessed training data


In [10]:
# Write the preprocessed training frame to disk; index=False leaves the row index out of the file
df.to_csv(path_or_buf='../data/preproc_kdd_train.csv', index=False)
print(f'Saved: ../data/preproc_kdd_train.csv \t{df.shape}')

Saved: ../data/preproc_kdd_train.csv 	(125973, 40)


## Process test dataset

Apply the same transformations to the test dataset, reusing the frequency mappings (`service_freq`, `flag_freq`) learned from the training data. No scaler is fitted or applied in this notebook.


In [11]:
# Read the raw test split and keep a handle on its label column
df_test = pd.read_csv(filepath_or_buffer='../data/kdd_test.csv')
attack_col_test = df_test['attack_type']

# Remove the same five columns that were removed from the training frame
df_test = df_test.drop(
  [
    'num_outbound_cmds',
    'difficulty_level',
    'attack_type',
    'dst_host_count',
    'dst_host_srv_count',
  ],
  axis='columns',
)

# log1p the same feature list that was used for the training frame
df_test = df_test.assign(
  **{name: np.log1p(df_test[name]) for name in log_features}
)

print(f'Test shape after feature engineering: {df_test.shape}')

Test shape after feature engineering: (22544, 38)


In [12]:
# Apply the same encoding as training.
#
# protocol_type: build exactly the indicator columns that exist in the training
# frame instead of calling get_dummies(drop_first=True) on the test frame.
# An independent get_dummies call derives both the column set and the dropped
# baseline from whatever protocol values happen to occur in the test data, so a
# missing or extra protocol would silently produce a different schema (or a
# different baseline) than the one used for training.
# A protocol value that has no training indicator column (the training baseline,
# or a value unseen in training) is encoded as all zeros.
train_protocol_cols = [col for col in df.columns if col.startswith('protocol_')]
protocol_values = df_test.pop('protocol_type')
for protocol_col in train_protocol_cols:
  category = protocol_col[len('protocol_'):]
  df_test[protocol_col] = (protocol_values == category).astype(int)

# service / flag: look up the training-set counts. Categories never seen in
# training get 0. The cast back to int keeps the dtype identical to the training
# frame even when fillna had to fill a gap (which would otherwise turn the
# column into floats).
df_test['service'] = df_test['service'].map(service_freq).fillna(0).astype(int)
df_test['flag'] = df_test['flag'].map(flag_freq).fillna(0).astype(int)

print(f'Test shape after encoding: {df_test.shape}')

Test shape after encoding: (22544, 39)


In [13]:
# Attach the saved test labels again; the column is appended as the last one
df_test = df_test.assign(attack_type=attack_col_test)
print(f'Final test shape: {df_test.shape}')

Final test shape: (22544, 40)


In [14]:
# Write the preprocessed test frame to disk; index=False leaves the row index out of the file
df_test.to_csv(path_or_buf='../data/preproc_kdd_test.csv', index=False)
print(f'Saved: ../data/preproc_kdd_test.csv \t{df_test.shape}')

Saved: ../data/preproc_kdd_test.csv 	(22544, 40)
