# Feature Engineering and Scaling

## Feature Engineering

In [1]:
import pandas as pd

# Load datasets
train_df = pd.read_csv("../datasets/UNSW_NB15_encoded_training-set.csv")
test_df = pd.read_csv("../datasets/UNSW_NB15_encoded_testing-set.csv")

In [2]:
# Overview of train_df
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175341 entries, 0 to 175340
Data columns (total 65 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ackdat             175341 non-null  float64
 1   attack_cat         175341 non-null  int64  
 2   ct_dst_ltm         175341 non-null  int64  
 3   ct_dst_sport_ltm   175341 non-null  int64  
 4   ct_dst_src_ltm     175341 non-null  int64  
 5   ct_flw_http_mthd   175341 non-null  int64  
 6   ct_ftp_cmd         175341 non-null  int64  
 7   ct_src_dport_ltm   175341 non-null  int64  
 8   ct_src_ltm         175341 non-null  int64  
 9   ct_srv_dst         175341 non-null  int64  
 10  ct_srv_src         175341 non-null  int64  
 11  ct_state_ttl       175341 non-null  int64  
 12  dbytes             175341 non-null  int64  
 13  dinpkt             175341 non-null  float64
 14  djit               175341 non-null  float64
 15  dload              175341 non-null  float64
 16  dl

In [3]:
# Overview of test_df
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 65 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ackdat             82332 non-null  float64
 1   attack_cat         82332 non-null  int64  
 2   ct_dst_ltm         82332 non-null  int64  
 3   ct_dst_sport_ltm   82332 non-null  int64  
 4   ct_dst_src_ltm     82332 non-null  int64  
 5   ct_flw_http_mthd   82332 non-null  int64  
 6   ct_ftp_cmd         82332 non-null  int64  
 7   ct_src_dport_ltm   82332 non-null  int64  
 8   ct_src_ltm         82332 non-null  int64  
 9   ct_srv_dst         82332 non-null  int64  
 10  ct_srv_src         82332 non-null  int64  
 11  ct_state_ttl       82332 non-null  int64  
 12  dbytes             82332 non-null  int64  
 13  dinpkt             82332 non-null  float64
 14  djit               82332 non-null  float64
 15  dload              82332 non-null  float64
 16  dloss              823

In [4]:
# Create interaction features for train_df
train_df["sload_dload_product"] = train_df["sload"] * train_df["dload"]
train_df["sload_dload_ratio"] = train_df["sload"] / (train_df["dload"] + 1)
train_df["sload_dload_sum"] = train_df["sload"] + train_df["dload"]

In [5]:
# Create interaction features for test_df
test_df["sload_dload_product"] = test_df["sload"] * test_df["dload"]
test_df["sload_dload_ratio"] = test_df["sload"] / (test_df["dload"] + 1)
test_df["sload_dload_sum"] = test_df["sload"] + test_df["dload"]

**Log Transform Skewed Features**

We have identified the highlt skewed data in `01_EDA.ipynb`. Now, we will use it!

In [6]:
high_skewed_features = [
    "trans_depth", "response_body_len", "sbytes", "sloss", "dloss", "spkts", 
    "dbytes", "dpkts", "dinpkt", "djit", "ct_flw_http_mthd", "sjit", 
    "sload", "dload", "sinpkt", "dur", "synack", "ackdat", "smean", 
    "rate", "dmean", "ct_src_dport_ltm", "ct_dst_ltm", "ct_src_ltm", 
    "ct_dst_sport_ltm", "ct_dst_src_ltm", "ct_srv_dst", "ct_srv_src", 
    "stcpb", "dtcpb"
]

In [7]:
import numpy as np

# Apply log1p transformation to all skewed features
for feature in high_skewed_features:
    if feature in train_df.columns:  # Ensure column exists in train_df
        train_df[feature] = np.log1p(train_df[feature])
    if feature in test_df.columns:  # Ensure column exists in test_df
        test_df[feature] = np.log1p(test_df[feature])

In [8]:
# Recalculate skewness
skewness_after = train_df[high_skewed_features].skew().sort_values(ascending=False)
print("Skewness After Transformation:")
print(skewness_after)

Skewness After Transformation:
response_body_len    4.168025
ct_flw_http_mthd     4.051059
synack               3.432785
ackdat               3.427472
dur                  3.366384
trans_depth          2.924598
smean                2.029467
dloss                1.786355
sloss                1.376688
sinpkt               1.240509
ct_dst_sport_ltm     1.235297
ct_src_dport_ltm     1.171539
sbytes               1.169003
spkts                1.116330
dinpkt               0.901420
djit                 0.847629
ct_dst_ltm           0.845749
dpkts                0.784531
ct_src_ltm           0.654360
ct_dst_src_ltm       0.631237
ct_srv_dst           0.475789
ct_srv_src           0.433191
sjit                 0.430264
dbytes               0.371869
dload                0.314908
dmean                0.218497
dtcpb                0.208044
stcpb                0.207972
rate                -0.202975
sload               -0.475929
dtype: float64


**Scaling**

In [9]:
from sklearn.preprocessing import StandardScaler

# Select numerical columns
num_columns = train_df.select_dtypes(include=["float64", "int64"]).columns

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform train and test data
train_df[num_columns] = scaler.fit_transform(train_df[num_columns])
test_df[num_columns] = scaler.transform(test_df[num_columns])

In [11]:
train_df

Unnamed: 0,ackdat,attack_cat,ct_dst_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_flw_http_mthd,ct_ftp_cmd,ct_src_dport_ltm,ct_src_ltm,ct_srv_dst,...,state_no,stcpb,sttl,swin,synack,tcprtt,trans_depth,sload_dload_product,sload_dload_ratio,sload_dload_sum
0,-0.528664,-0.126454,-0.958048,-0.641276,-1.064936,-0.312637,-0.118590,-0.713075,-1.093992,-1.167162,...,False,1.012541,0.703839,1.092456,-0.510116,-0.521660,-0.334225,-0.238954,-0.388477,-0.393922
1,-0.528664,-0.126454,-0.958048,-0.641276,-0.654282,-0.312637,-0.118590,-0.713075,-1.093992,0.128233,...,False,1.090596,-1.141901,1.092456,-0.510116,-0.521660,-0.334225,-0.237764,-0.388477,-0.391321
2,0.808064,-0.126454,-0.493499,-0.641276,-0.362918,-0.312637,-0.118590,-0.713075,-0.630503,0.128233,...,False,1.128512,-1.141901,1.092456,1.011431,0.888444,-0.334225,-0.238961,-0.388477,-0.393710
3,-0.528664,-0.126454,-0.493499,-0.641276,-0.362918,-0.312637,7.814915,-0.713075,-0.630503,-1.167162,...,False,1.067170,-1.141901,1.092456,-0.510116,-0.521660,-0.334225,-0.238986,-0.388477,-0.394010
4,0.983219,-0.126454,-0.493499,-0.641276,1.994143,-0.312637,-0.118590,-0.264706,-0.630503,1.930517,...,False,1.141845,0.723268,1.092456,1.243236,1.096172,-0.334225,-0.238979,-0.388477,-0.393976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-0.528664,0.740389,1.935725,1.717306,1.493116,-0.312637,-0.118590,2.079913,1.793180,1.444519,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703
175337,1.119340,1.607231,-0.958048,-0.641276,-0.654282,-0.312637,-0.118590,-0.713075,-1.093992,-1.167162,...,False,1.176661,0.723268,1.092456,0.414154,0.731463,-0.334225,-0.238976,-0.388477,-0.393969
175338,-0.528664,0.740389,-0.163896,0.198868,0.905877,-0.312637,-0.118590,0.053417,-0.301652,0.768339,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703
175339,-0.528664,0.740389,2.182182,1.800930,1.710980,-0.312637,-0.118590,2.317786,2.039075,1.666951,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703


In [12]:
train_df

Unnamed: 0,ackdat,attack_cat,ct_dst_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_flw_http_mthd,ct_ftp_cmd,ct_src_dport_ltm,ct_src_ltm,ct_srv_dst,...,state_no,stcpb,sttl,swin,synack,tcprtt,trans_depth,sload_dload_product,sload_dload_ratio,sload_dload_sum
0,-0.528664,-0.126454,-0.958048,-0.641276,-1.064936,-0.312637,-0.118590,-0.713075,-1.093992,-1.167162,...,False,1.012541,0.703839,1.092456,-0.510116,-0.521660,-0.334225,-0.238954,-0.388477,-0.393922
1,-0.528664,-0.126454,-0.958048,-0.641276,-0.654282,-0.312637,-0.118590,-0.713075,-1.093992,0.128233,...,False,1.090596,-1.141901,1.092456,-0.510116,-0.521660,-0.334225,-0.237764,-0.388477,-0.391321
2,0.808064,-0.126454,-0.493499,-0.641276,-0.362918,-0.312637,-0.118590,-0.713075,-0.630503,0.128233,...,False,1.128512,-1.141901,1.092456,1.011431,0.888444,-0.334225,-0.238961,-0.388477,-0.393710
3,-0.528664,-0.126454,-0.493499,-0.641276,-0.362918,-0.312637,7.814915,-0.713075,-0.630503,-1.167162,...,False,1.067170,-1.141901,1.092456,-0.510116,-0.521660,-0.334225,-0.238986,-0.388477,-0.394010
4,0.983219,-0.126454,-0.493499,-0.641276,1.994143,-0.312637,-0.118590,-0.264706,-0.630503,1.930517,...,False,1.141845,0.723268,1.092456,1.243236,1.096172,-0.334225,-0.238979,-0.388477,-0.393976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,-0.528664,0.740389,1.935725,1.717306,1.493116,-0.312637,-0.118590,2.079913,1.793180,1.444519,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703
175337,1.119340,1.607231,-0.958048,-0.641276,-0.654282,-0.312637,-0.118590,-0.713075,-1.093992,-1.167162,...,False,1.176661,0.723268,1.092456,0.414154,0.731463,-0.334225,-0.238976,-0.388477,-0.393969
175338,-0.528664,0.740389,-0.163896,0.198868,0.905877,-0.312637,-0.118590,0.053417,-0.301652,0.768339,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703
175339,-0.528664,0.740389,2.182182,1.800930,1.710980,-0.312637,-0.118590,2.317786,2.039075,1.666951,...,False,-0.904695,0.723268,-0.915407,-0.510116,-0.521660,-0.334225,-0.238989,-0.119617,-0.124703


In [10]:
# Save preprocessed data for later sections
train_df.to_csv("../datasets/UNSW_NB15_preprocessed_training-set.csv", index=False)
test_df.to_csv("../datasets/UNSW_NB15_preprocessed_testing-set.csv", index=False)