# Feature Engineering and Scaling

## Feature Engineering

In [1]:
import pandas as pd

# Read the UNSW-NB15 training and testing partitions into DataFrames
train_df, test_df = (
    pd.read_csv("../datasets/UNSW_NB15_training-set.csv"),
    pd.read_csv("../datasets/UNSW_NB15_testing-set.csv"),
)

In [2]:
# Overview of train_df: prints the index range, every column with its
# non-null count and dtype, so missing values and non-numeric columns
# can be spotted before any feature engineering.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 45 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 175341 non-null  int64  
 1   dur                175341 non-null  float64
 2   proto              175341 non-null  object 
 3   service            175341 non-null  object 
 4   state              175341 non-null  object 
 5   spkts              175341 non-null  int64  
 6   dpkts              175341 non-null  int64  
 7   sbytes             175341 non-null  int64  
 8   dbytes             175341 non-null  int64  
 9   rate               175341 non-null  float64
 10  sttl               175341 non-null  int64  
 11  dttl               175341 non-null  int64  
 12  sload              175341 non-null  float64
 13  dload              175341 non-null  float64
 14  sloss              175341 non-null  int64  
 15  dloss              175341 non-null  int64  
 16  si

In [3]:
# Overview of test_df: prints the index range, every column with its
# non-null count and dtype, for comparison against the training frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [4]:
# Derive sload/dload interaction features on the training frame.
# This cell runs before the log transform, so the raw load values are used.
train_df["sload_dload_product"] = train_df["sload"].mul(train_df["dload"])
# The +1 keeps the denominator non-zero when dload is 0.
train_df["sload_dload_ratio"] = train_df["sload"].div(train_df["dload"].add(1))
train_df["sload_dload_sum"] = train_df["sload"].add(train_df["dload"])

In [5]:
# Derive the same sload/dload interaction features on the testing frame.
# This cell runs before the log transform, so the raw load values are used.
test_df["sload_dload_product"] = test_df["sload"].mul(test_df["dload"])
# The +1 keeps the denominator non-zero when dload is 0.
test_df["sload_dload_ratio"] = test_df["sload"].div(test_df["dload"].add(1))
test_df["sload_dload_sum"] = test_df["sload"].add(test_df["dload"])

**Log Transform Skewed Features**

We identified the highly skewed features in `01_EDA.ipynb`. Here we apply a `log1p` transformation to them to reduce that skew.

In [6]:
# Columns to be log-transformed; the list order is preserved as written.
high_skewed_features = [
    "trans_depth", "response_body_len", "sbytes", "sloss", "dloss",
    "spkts", "dbytes", "dpkts", "dinpkt", "djit",
    "ct_flw_http_mthd", "sjit", "sload", "dload", "sinpkt",
    "dur", "synack", "ackdat", "smean", "rate",
    "dmean", "ct_src_dport_ltm", "ct_dst_ltm", "ct_src_ltm", "ct_dst_sport_ltm",
    "ct_dst_src_ltm", "ct_srv_dst", "ct_srv_src", "stcpb", "dtcpb",
]

In [7]:
import numpy as np

# Replace each skewed column with log(1 + x), in both frames.
# A column listed in high_skewed_features but absent from a frame is skipped.
for frame in (train_df, test_df):
    present = [name for name in high_skewed_features if name in frame.columns]
    for name in present:
        frame[name] = np.log1p(frame[name])

In [8]:
# Skewness of the transformed training columns, most skewed first
skewness_after = (
    train_df[high_skewed_features]
    .skew()
    .sort_values(ascending=False)
)
print("Skewness After Transformation:", skewness_after, sep="\n")

Skewness After Transformation:
response_body_len    4.168025
ct_flw_http_mthd     4.051059
synack               3.432785
ackdat               3.427472
dur                  3.366384
trans_depth          2.924598
smean                2.029467
dloss                1.786355
sloss                1.376688
sinpkt               1.240509
ct_dst_sport_ltm     1.235297
ct_src_dport_ltm     1.171539
sbytes               1.169003
spkts                1.116330
dinpkt               0.901420
djit                 0.847629
ct_dst_ltm           0.845749
dpkts                0.784531
ct_src_ltm           0.654360
ct_dst_src_ltm       0.631237
ct_srv_dst           0.475789
ct_srv_src           0.433191
sjit                 0.430264
dbytes               0.371869
dload                0.314908
dmean                0.218497
dtcpb                0.208044
stcpb                0.207972
rate                -0.202975
sload               -0.475929
dtype: float64


**Scaling**

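The `label` column is the classification target, so it must not be scaled: standardising it would replace the 0/1 class values with arbitrary floats. We therefore separate it from the features before scaling and reattach it afterwards.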

In [9]:
# Split each frame into its target (label) and the remaining feature columns
X_train, y_train = train_df.drop("label", axis=1), train_df["label"]
X_test, y_test = test_df.drop("label", axis=1), test_df["label"]

In [10]:
from sklearn.preprocessing import StandardScaler

# Select the numerical columns from the TRAINING features only. The scaler is
# fitted on exactly these columns, so the test set must be transformed with the
# same columns in the same order; selecting them independently from X_test
# could misalign features if a dtype or column order differed between frames.
train_num_columns = X_train.select_dtypes(include=["float64", "int64"]).columns
test_num_columns = train_num_columns  # kept as a separate name for later cells

# Fail early, with a readable message, if the test frame lacks a fitted column.
missing_in_test = train_num_columns.difference(X_test.columns)
if len(missing_in_test) > 0:
    raise ValueError(
        f"Columns used to fit the scaler are missing from X_test: {list(missing_in_test)}"
    )

# NOTE(review): `id` is an int64 column and is therefore standardised too;
# confirm downstream code treats it as an identifier, not as a feature.

# Initialize the scaler
scaler = StandardScaler()

# Fit on the training data only, then apply the same statistics to the test
# data so no information from the test set leaks into the scaling.
X_train[train_num_columns] = scaler.fit_transform(X_train[train_num_columns])
X_test[test_num_columns] = scaler.transform(X_test[test_num_columns])

In [11]:
# Put the target back as a "label" column so each saved file is self-contained.
# The index is reset so the labels line up with a 0..n-1 index.
for features, target in ((X_train, y_train), (X_test, y_test)):
    features["label"] = target.reset_index(drop=True)

In [13]:
# Sanity check: the distinct label values in the training frame after reattaching
X_train["label"].unique()

array([0, 1])

In [14]:
# Sanity check: the distinct label values in the testing frame after reattaching
X_test["label"].unique()

array([0, 1])

In [16]:
# Display the processed training frame (scaled numeric columns plus label)
X_train

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,sload_dload_product,sload_dload_ratio,sload_dload_sum,label
0,-1.732041,-0.342123,tcp,-,FIN,-0.068307,0.140261,-0.441590,0.334160,-0.852011,...,-0.118590,-0.312637,-1.093992,-1.167162,-0.126508,Normal,-0.238954,-0.388477,-0.393922,0
1,-1.732021,0.247757,tcp,-,FIN,0.597577,1.463624,0.188354,1.720441,-0.838341,...,-0.118590,-0.312637,-1.093992,0.128233,-0.126508,Normal,-0.237764,-0.388477,-0.391321,0
2,-1.732001,0.956178,tcp,-,FIN,0.151267,0.928674,-0.234394,1.427966,-1.237125,...,-0.118590,-0.312637,-0.630503,0.128233,-0.126508,Normal,-0.238961,-0.388477,-0.393710,0
3,-1.731982,0.989886,tcp,ftp,FIN,0.472549,0.755846,0.094296,0.711339,-1.245081,...,7.814915,-0.312637,-0.630503,-1.167162,-0.126508,Normal,-0.238986,-0.388477,-0.394010,0
4,-1.731962,0.049848,tcp,-,FIN,0.326594,0.357032,-0.003462,0.445572,-1.040160,...,-0.118590,-0.312637,-0.630503,1.930517,-0.126508,Normal,-0.238979,-0.388477,-0.393976,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,1.731962,-0.517281,udp,dns,INT,-0.808594,-0.896615,-0.931933,-0.966509,0.905728,...,-0.118590,-0.312637,1.793180,1.444519,-0.126508,Generic,-0.238989,-0.119617,-0.124703,1
175337,1.731982,0.108080,tcp,-,FIN,0.326594,0.518940,0.086565,0.515589,-1.038493,...,-0.118590,-0.312637,-1.093992,-1.167162,-0.126508,Shellcode,-0.238976,-0.388477,-0.393969,1
175338,1.732001,-0.517281,udp,dns,INT,-0.808594,-0.896615,-0.931933,-0.966509,0.905728,...,-0.118590,-0.312637,-0.301652,0.768339,-0.126508,Generic,-0.238989,-0.119617,-0.124703,1
175339,1.732021,-0.517281,udp,dns,INT,-0.808594,-0.896615,-0.931933,-0.966509,0.905728,...,-0.118590,-0.312637,2.039075,1.666951,-0.126508,Generic,-0.238989,-0.119617,-0.124703,1


In [17]:
# Display the processed testing frame (scaled numeric columns plus label)
X_test

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,sload_dload_product,sload_dload_ratio,sload_dload_sum,label
0,-1.732041,-0.517278,udp,-,INT,-0.808594,-0.896615,-0.047959,-0.966509,0.857408,...,-0.11859,-0.312637,-1.093992,-0.747899,-0.126508,Normal,-0.238989,0.568615,0.564753,0
1,-1.732021,-0.517282,udp,-,INT,-0.808594,-0.896615,0.716750,-0.966509,0.934090,...,-0.11859,-0.312637,-1.093992,-0.747899,-0.126508,Normal,-0.238989,4.286514,4.289264,0
2,-1.732001,-0.517287,udp,-,INT,-0.808594,-0.896615,0.414598,-0.966509,1.047265,...,-0.11859,-0.312637,-1.093992,-0.450427,-0.126508,Normal,-0.238989,4.145362,4.147862,0
3,-1.731982,-0.517285,udp,-,INT,-0.808594,-0.896615,0.311339,-0.966509,1.003363,...,-0.11859,-0.312637,-0.630503,-0.450427,-0.126508,Normal,-0.238989,2.795399,2.795497,0
4,-1.731962,-0.517279,udp,-,INT,-0.808594,-0.896615,0.830108,-0.966509,0.880358,...,-0.11859,-0.312637,-0.630503,-0.450427,-0.126508,Normal,-0.238989,4.124137,4.126598,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,-0.105558,-0.517287,udp,-,INT,-0.808594,-0.896615,-0.986875,-0.966509,1.047265,...,-0.11859,-0.312637,-0.630503,-1.167162,-0.126508,Normal,-0.238989,0.053020,0.048241,0
82328,-0.105539,0.620756,tcp,-,FIN,0.891554,0.518940,2.122046,0.515589,-1.112918,...,-0.11859,-0.312637,-0.301652,-0.747899,-0.126508,Normal,-0.238908,-0.388477,-0.393371,0
82329,-0.105519,-0.517294,arp,-,INT,-1.162850,-0.896615,-1.472337,-0.966509,-1.891934,...,-0.11859,-0.312637,-1.093992,-1.167162,7.904641,Normal,-0.238989,-0.388477,-0.394042,0
82330,-0.105499,-0.517294,arp,-,INT,-1.162850,-0.896615,-1.472337,-0.966509,-1.891934,...,-0.11859,-0.312637,-1.093992,-1.167162,7.904641,Normal,-0.238989,-0.388477,-0.394042,0


In [18]:
# Write both processed frames to CSV (without the index) for later notebooks
for frame, destination in (
    (X_train, "../datasets/UNSW_NB15_scaled_training-set.csv"),
    (X_test, "../datasets/UNSW_NB15_scaled_testing-set.csv"),
):
    frame.to_csv(destination, index=False)