In [1]:
import pandas as pd
import ipaddress

In [2]:
# Load the two pickled datasets (mirai and benign) into DataFrames.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")

In [3]:
# (rows, columns) of the mirai dataset as loaded
mirai_df.shape

(764137, 8)

In [4]:
# number of missing entries in each column of the mirai dataset
mirai_df.isna().sum()

Timestamp                0
Source IP           566436
Destination IP      566436
Source Port         610036
Destination Port    610036
Payload                  0
Packet Length            0
Protocol            566436
dtype: int64

In [5]:
# (rows, columns) of the benign dataset as loaded
benign_df.shape

(38642, 8)

In [6]:
# number of missing entries in each column of the benign dataset
benign_df.isna().sum()

Timestamp               0
Source IP           15586
Destination IP      15586
Source Port         12826
Destination Port    12826
Payload                 0
Packet Length           0
Protocol            15586
dtype: int64

# Drop
One way to clean a dataset of misleading records with incomplete information, such as `NaN` values, is simply to drop those records.

In [7]:
# Discard every row that is missing an IP address or a port on either end,
# then show how many rows are left.
mirai_df = mirai_df.dropna(
    how="any",
    subset=["Source IP", "Destination IP", "Source Port", "Destination Port"],
)
mirai_df.shape

(154090, 8)

In [8]:
# per-column count of missing entries after the drop
mirai_df.isna().sum()

Timestamp           0
Source IP           0
Destination IP      0
Source Port         0
Destination Port    0
Payload             0
Packet Length       0
Protocol            0
dtype: int64

In [9]:
# Same clean-up for the benign dataset: drop rows missing an IP address or a port,
# then show how many rows are left.
benign_df = benign_df.dropna(
    how="any",
    subset=["Source IP", "Destination IP", "Source Port", "Destination Port"],
)
benign_df.shape

(18615, 8)

# Impute
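Another way to eliminate missing values (`None`/`NaN`) is to fill in the blanks with meaningful values derived from the distribution, mean, or other statistics of the data. This kind of statistical imputation applies only to numerical data, which is why the IP addresses are converted to numbers first.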

In [10]:
# re-load the original datasets, because the rows with missing values were
# dropped from mirai_df / benign_df in the cells above
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")

In [11]:
# convert ip address to numeric values
def ip_to_numeric(ip):
    """Convert an IP address to its integer representation.

    Parameters
    ----------
    ip : str or None
        An IPv4/IPv6 address, optionally with a prefix length
        (e.g. ``"192.168.2.1"`` or ``"192.168.2.1/24"``). Missing values
        (``None``, ``NaN``, ``pd.NA``) and empty values are accepted.

    Returns
    -------
    int or None
        The integer value of the network address of ``ip``. For an address
        given without a prefix this is the address itself; with a prefix,
        the host bits are zeroed. ``None`` is returned for missing or empty
        input.

    Raises
    ------
    ValueError
        If ``ip`` is a non-empty value that is not a valid address.
    """
    # pd.isna must be checked first: float NaN is truthy, so a plain
    # truthiness test would pass it on to ipaddress and raise ValueError.
    if pd.isna(ip) or not ip:
        return None

    ip_obj = ipaddress.ip_interface(ip)
    return int(ip_obj.network.network_address)

In [12]:
# replace both IP address columns of the mirai dataset with their numeric form
for ip_column in ("Source IP", "Destination IP"):
    mirai_df[ip_column] = mirai_df[ip_column].apply(ip_to_numeric)

In [14]:
from feature_engine.imputation import MeanMedianImputer

# Fill the gaps in the listed numeric columns with each column's median.
# MeanMedianImputer accepts imputation_method="median" or "mean".
imputer = MeanMedianImputer(
    imputation_method="median",
    variables=["Source IP", "Destination IP", "Source Port", "Destination Port", "Protocol"],
)

# Learn the per-column medians from the mirai data, then fill its missing values with them
imputer.fit(mirai_df)
mirai_df = imputer.transform(mirai_df)

mirai_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
0,1540446382.933899,3.232236e+09,8.739992e+08,21074.0,80.0,Padding,26,6.0
1,1540446382.933904,3.232236e+09,8.740708e+08,20532.0,8280.0,Padding,26,6.0
2,1540446382.934426,3.232236e+09,3.232236e+09,32761.0,8280.0,IPerror / TCPerror,52,1.0
3,1540446382.934636,3.232236e+09,3.232236e+09,32761.0,8280.0,IPerror / TCPerror,52,1.0
4,1540446383.291054,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.106 says 192.168.2.109 /...,60,17.0
...,...,...,...,...,...,...,...,...
764132,1540453519.837515,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.165 says 192.168.2.110 /...,60,17.0
764133,1540453519.839396,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.166 says 192.168.2.110 /...,60,17.0
764134,1540453519.840611,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.167 says 192.168.2.110 /...,60,17.0
764135,1540453519.842369,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.168 says 192.168.2.110 /...,60,17.0


# Normalize
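If we have extreme values, we may want to rescale them, e.g. by projecting them onto a specific range such as `(0, 1)`, or by standardizing them to zero mean and unit variance, as `StandardScaler` does below.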

In [15]:
from sklearn.preprocessing import StandardScaler

# StandardScaler rescales a feature to zero mean and unit variance
scaler = StandardScaler()

# the scaler expects 2-D input, so the column is passed as an (n_rows, 1) array
mirai_df["Packet Length"] = scaler.fit_transform(
    mirai_df["Packet Length"].to_numpy().reshape(-1, 1)
)
mirai_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
0,1540446382.933899,3.232236e+09,8.739992e+08,21074.0,80.0,Padding,-2.908731,6.0
1,1540446382.933904,3.232236e+09,8.740708e+08,20532.0,8280.0,Padding,-2.908731,6.0
2,1540446382.934426,3.232236e+09,3.232236e+09,32761.0,8280.0,IPerror / TCPerror,-0.472762,1.0
3,1540446382.934636,3.232236e+09,3.232236e+09,32761.0,8280.0,IPerror / TCPerror,-0.472762,1.0
4,1540446383.291054,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.106 says 192.168.2.109 /...,0.276767,17.0
...,...,...,...,...,...,...,...,...
764132,1540453519.837515,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.165 says 192.168.2.110 /...,0.276767,17.0
764133,1540453519.839396,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.166 says 192.168.2.110 /...,0.276767,17.0
764134,1540453519.840611,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.167 says 192.168.2.110 /...,0.276767,17.0
764135,1540453519.842369,3.232236e+09,2.063133e+09,32761.0,8280.0,ARP who has 192.168.2.168 says 192.168.2.110 /...,0.276767,17.0


# Extract
Features may be hidden in specific records, such as a domain name inside a DNS packet payload. We extract these with regular expressions or suitable Python libraries.

In [16]:
import re

# Pattern with a single capture group: the name that follows `DNS Qry "b'`
# in the payload text, up to the closing quote
domain_regex = r'DNS Qry "b\'([^\']+)\''

# Store the captured name in a new column; payloads that do not match get NaN
mirai_df["Domain"] = mirai_df["Payload"].str.extract(domain_regex, expand=False)

In [18]:
# show only the rows where a domain was extracted; packets whose payload
# did not match the DNS pattern have NaN in "Domain" and are left out
mirai_df["Domain"].dropna()

7         xmpp.samsungsmartcam.com.Speedport_W_724V_0101...
12                                xmpp.samsungsmartcam.com.
13                                        time.windows.com.
14                                 device-abnormal.eye4.cn.
15                                 device-abnormal.eye4.cn.
                                ...                        
763977                            xmpp.samsungsmartcam.com.
764108                             device-abnormal.eye4.cn.
764122                                    time.windows.com.
764123                                    time.windows.com.
764136    north-america.pool.ntp.org.Speedport_W_724V_01...
Name: Domain, Length: 40459, dtype: object

# Transform
We can transform numeric features to another range, for example with Min/Max scaling, to make them equally important and more consistent. This matters especially when their location in a coordinate space affects the ML algorithm.

In [19]:
# load numeric flow data for both classes
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
mirai_flow_df_numeric = pd.read_pickle("../data/blog_eda/mirai_flow_numeric.pkl")
# Fixed copy-paste slip: this previously loaded mirai_flow_numeric.pkl a second time,
# so the "benign" frame was a copy of the mirai data.
# NOTE(review): the benign file name is assumed to follow the mirai/benign naming
# used for the raw pickles — confirm the file exists under this name.
benign_flow_df_numeric = pd.read_pickle("../data/blog_eda/benign_flow_numeric.pkl")

In [20]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler is fed a plain numpy matrix of the numeric flow features
packet_features = mirai_flow_df_numeric.to_numpy()

# Learn each column's min and max, then rescale every column accordingly
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(packet_features)

# Show the matrix before and after scaling
print("Original Features:", packet_features, sep="\n")
print("\nScaled Features (Min-Max Scaling):", scaled_features, sep="\n")

Original Features:
[[6.80000000e+01 6.70000000e+01 1.70000000e+01 ... 6.13112100e+03
  0.00000000e+00 4.29496730e+09]
 [2.30000000e+01 2.18970000e+04 6.00000000e+00 ... 2.37557245e+03
  3.23223603e+09 3.23223614e+09]
 [2.30000000e+01 4.67340000e+04 6.00000000e+00 ... 4.79078110e+01
  3.23223603e+09 3.23223614e+09]
 ...
 [4.24140000e+04 1.23000000e+02 1.70000000e+01 ... 0.00000000e+00
  3.23223623e+09 3.63263086e+09]
 [4.91340000e+04 1.23000000e+02 1.70000000e+01 ... 0.00000000e+00
  3.23223623e+09 1.12323251e+09]
 [5.09600000e+04 1.23000000e+02 1.70000000e+01 ... 0.00000000e+00
  3.23223623e+09 1.12323251e+09]]

Scaled Features (Min-Max Scaling):
[[1.04187415e-03 6.74391515e-04 1.00000000e+00 ... 8.59241670e-01
  0.00000000e+00 1.00000000e+00]
 [3.52398609e-04 3.35264545e-01 0.00000000e+00 ... 3.32922941e-01
  9.99999940e-01 7.44549488e-01]
 [3.52398609e-04 7.15943228e-01 0.00000000e+00 ... 6.71400671e-03
  9.99999940e-01 7.44549488e-01]
 ...
 [6.49853678e-01 1.53270799e-03 1.00000000e