In [None]:
import pandas as pd
import numpy as np
import sweetviz as sv
import ipaddress

from scapy.all import PcapReader, IP, TCP, UDP, ICMP
from scipy.stats import ttest_ind, kstest, norm, skew, kurtosis, zscore
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

from skimpy import skim
from summarytools import dfSummary
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# Readers for the raw Mirai and benign captures. They are only consumed by
# pcap_to_dataframe(), i.e. when the cached pickles have to be (re)built.
pcap_reader_mirai = PcapReader("../data/blog_eda/mirai.pcap")
pcap_reader_benign = PcapReader("../data/blog_eda/benign.pcapng")

# Preprocess

- convert data to streams
- collect some numbers

In [None]:
def pcap_to_dataframe(pcap_reader: PcapReader) -> pd.DataFrame:
    """Flatten a packet capture into one dataframe row per packet.

    Addresses and the protocol number are read from the IP layer and are
    ``None`` when the packet has no IP layer. Ports, payload and length come
    from the first layer present among TCP, UDP and ICMP. ICMP has no ports,
    and a packet with none of these layers falls back to the whole packet for
    its payload and length.

    Args:
        pcap_reader (PcapReader): packet capture read using scapy

    Returns:
        pd.DataFrame: one row per packet with the columns Timestamp,
            Source IP, Destination IP, Source Port, Destination Port,
            Payload, Packet Length and Protocol
    """
    column_names = [
        "Timestamp",
        "Source IP",
        "Destination IP",
        "Source Port",
        "Destination Port",
        "Payload",
        "Packet Length",
        "Protocol",
    ]
    rows = []

    for pkt in pcap_reader:
        # Network layer: addresses and protocol number, when available.
        if pkt.haslayer(IP):
            ip_layer = pkt[IP]
            src_ip, dst_ip, protocol = ip_layer.src, ip_layer.dst, ip_layer.proto
        else:
            src_ip = dst_ip = protocol = None

        # Pick the layer that provides payload and length; only TCP and UDP
        # carry port numbers.
        if pkt.haslayer(TCP):
            layer, has_ports = pkt[TCP], True
        elif pkt.haslayer(UDP):
            layer, has_ports = pkt[UDP], True
        elif pkt.haslayer(ICMP):
            layer, has_ports = pkt[ICMP], False
        else:
            layer, has_ports = pkt, False

        if has_ports:
            src_port, dst_port = layer.sport, layer.dport
        else:
            src_port = dst_port = None

        payload = str(layer.payload)
        packet_len = len(layer)

        rows.append(
            [
                pkt.time,
                src_ip,
                dst_ip,
                src_port,
                dst_port,
                payload,
                packet_len,
                protocol,
            ]
        )

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# uncomment this on the first run (when the pickles do not exist yet);
# the conversion to a dataframe is slow, so subsequent runs load the pkl files instead
# mirai_df = pcap_to_dataframe(pcap_reader_mirai)
# benign_df = pcap_to_dataframe(pcap_reader_benign)

In [None]:
# save to pkl since dataframe conversion takes a long time
# mirai_df.to_pickle("../data/blog_eda/mirai.pkl")
# benign_df.to_pickle("../data/blog_eda/benign.pkl")

In [None]:
def _load_or_build_packets(pickle_path, pcap_reader):
    """Return the cached per-packet dataframe, building and caching it if missing.

    Args:
        pickle_path (str): location of the cached dataframe pickle
        pcap_reader (PcapReader): capture to convert when no cache exists

    Returns:
        pd.DataFrame: per-packet dataframe as produced by pcap_to_dataframe
    """
    try:
        return pd.read_pickle(pickle_path)
    except FileNotFoundError:
        # First run: there is no cache yet, so pay for the slow pcap
        # conversion once and store the result for subsequent runs.
        packets_df = pcap_to_dataframe(pcap_reader)
        packets_df.to_pickle(pickle_path)
        return packets_df


mirai_df = _load_or_build_packets("../data/blog_eda/mirai.pkl", pcap_reader_mirai)
benign_df = _load_or_build_packets("../data/blog_eda/benign.pkl", pcap_reader_benign)

In [None]:
# Preview the per-packet Mirai dataframe.
mirai_df

In [None]:
# Preview the per-packet benign dataframe.
benign_df

In [None]:
def extract_flows(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-packet rows into one row per flow.

    A flow is the set of packets that share the same (Source IP,
    Destination IP, Source Port, Destination Port, Protocol) tuple. The two
    directions of a conversation therefore end up in separate flows.

    The grouping relies on pandas' default handling of missing keys, which
    drops rows with a missing value in any key column. Packets without an IP
    layer or without ports do not contribute to any flow.

    Args:
        df (pd.DataFrame): per-packet data containing the five key columns
            plus "Packet Length" and "Timestamp"

    Returns:
        pd.DataFrame: one row per flow with the key columns followed by
            "Number of Packets", "Total Length" and "Duration" (last minus
            first timestamp, as float). When there are no flows, an empty
            dataframe with these columns is returned.
    """
    key_columns = [
        "Source IP",
        "Destination IP",
        "Source Port",
        "Destination Port",
        "Protocol",
    ]
    flow_columns = key_columns + ["Number of Packets", "Total Length", "Duration"]

    # Collect plain records and build the dataframe once at the end. This
    # avoids creating one dataframe per flow, and avoids pd.concat() raising
    # on an empty list when the input contains no flows.
    records = []
    for key, packets in df.groupby(key_columns):
        src_ip, dst_ip, src_port, dst_port, proto = key
        timestamps = packets["Timestamp"]
        records.append(
            {
                "Source IP": src_ip,
                "Destination IP": dst_ip,
                "Source Port": src_port,
                "Destination Port": dst_port,
                "Protocol": proto,
                "Number of Packets": len(packets),
                "Total Length": packets["Packet Length"].sum(),
                # Timestamps may be decimal-like objects; store a plain float.
                "Duration": float(timestamps.max() - timestamps.min()),
            }
        )

    return pd.DataFrame(records, columns=flow_columns)

In [None]:
# Aggregate the per-packet dataframes into per-flow dataframes.
mirai_flow_df = extract_flows(mirai_df)
benign_flow_df = extract_flows(benign_df)

# EDA

Exploratory Data Analysis approaches the dataset as a black box that we need to visualize and analyze statistically with the following goals:
- get insights about our data
- test hypotheses
- decide on models and further processing, such as feature engineering.

EDA can be performed on both benign and malicious data. Here we look only at the malicious (Mirai) data; however, the same functions can be applied to the benign data.

## Descriptive statistics & data

- Describe columns and data types
- Descriptive statistics
  -  count, 
  -  mean, 
  -  standard deviation, 
  -  minimum, 
  -  25th percentile, 
  -  median (50th percentile), 
  -  75th percentile, and 
  -  maximum

In [None]:
# Column names of the per-flow Mirai dataframe.
mirai_flow_df.columns

In [None]:
# Data type of each column in the per-flow Mirai dataframe.
mirai_flow_df.dtypes

In [None]:
# Descriptive statistics of the per-flow Mirai dataframe.
mirai_flow_df.describe()

In [None]:
# Correlation matrix for the numerical columns only. The IP columns still hold
# strings at this point, and recent pandas versions raise on non-numeric
# columns instead of silently skipping them, so select the numeric columns
# explicitly.
mirai_flow_df.select_dtypes(include="number").corr()

## Hypothesis testing

- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that the data in each group follows a normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, whether a network is compromised is the effect we want to explain.
  - independent: the cause. For example, the number of HTTP requests may affect whether a network is compromised.

In [None]:
def hypothesis_testing(df1, df2, col, alpha=0.05, equal_var=True):
    """Compare the mean of one column between two dataframes with a t-test.

    Runs a two-sample t-test (``scipy.stats.ttest_ind``) on ``df1[col]`` and
    ``df2[col]`` and reports whether the p-value is below ``alpha``.

    Args:
        df1 (pd.DataFrame): first group
        df2 (pd.DataFrame): second group
        col (str): name of the column to compare; must exist in both frames
        alpha (float): significance level. Defaults to 0.05.
        equal_var (bool): forwarded to ``ttest_ind``. ``True`` (the default,
            matching the previous behavior) runs Student's t-test; ``False``
            runs Welch's t-test, which does not assume equal variances.

    Returns:
        str: sentence stating whether the difference is statistically
            significant at the given significance level.

    Note:
        A NaN p-value (for example when the column contains NaN) is not
        below ``alpha`` and is therefore reported as "not statistically
        significant".
    """
    _, pvalue = ttest_ind(df1[col], df2[col], equal_var=equal_var)

    if pvalue < alpha:
        verdict, relation = "statistically significant", "<"
    else:
        verdict, relation = "not statistically significant", ">="

    return "The difference between benign and mirai traffic {} is {} (p {} {})".format(
        col, verdict, relation, alpha
    )

In [None]:
# Compare the mean number of packets per flow between Mirai and benign traffic.
hypothesis_testing(mirai_flow_df, benign_flow_df, "Number of Packets")

## Outliers

- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- can have a significant impact on statistical analysis
- detection methods
  - z-score: `(x - mean) / std_dev`; observations whose absolute z-score exceeds a chosen threshold (e.g. 3) are flagged
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [None]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows whose value in ``column`` is a z-score outlier.

    A row is an outlier when the absolute z-score of its value exceeds
    ``threshold``. Missing values are ignored when the mean and standard
    deviation are computed; otherwise a single NaN would turn every z-score
    into NaN and hide all outliers. Rows with a missing value are never
    reported as outliers.

    Args:
        df (pd.DataFrame): data to inspect
        column (str): name of a numeric column in ``df``
        threshold (float): absolute z-score above which a row is an outlier.
            Defaults to 3.

    Returns:
        pd.DataFrame: the rows of ``df`` that are outliers, with their
            original index.
    """
    values = df[column].to_numpy(dtype=float)
    zscores = np.abs(zscore(values, nan_policy="omit"))
    # NaN z-scores compare as False, so rows with missing values are excluded.
    return df[zscores > threshold]

In [None]:
# Flows whose total length has an absolute z-score above 3. The frame is the
# last expression of the cell so the notebook renders it as a table instead of
# printing plain text.
outliers = detect_outliers_zscore(mirai_flow_df, "Total Length", threshold=3)
outliers

# Feature Engineering

## Numerical

In [None]:
# convert ip address to numeric values
def ip_to_numeric(ip):
    """Map an IP address to the integer value of its network address.

    The input is parsed with ``ipaddress.ip_interface``. A bare address is
    its own network address, so it maps to its own integer value; an address
    given with a prefix length maps to the integer of the enclosing network.

    Args:
        ip: IPv4 or IPv6 address, optionally with a prefix length

    Returns:
        int: integer value of the network address
    """
    return int(ipaddress.ip_interface(ip).network.network_address)

In [None]:
# Add integer versions of the source and destination IPs to the Mirai flows.
# The original string columns are kept; they are dropped in a later cell.
mirai_flow_df["Source IP Numeric"] = mirai_flow_df["Source IP"].apply(ip_to_numeric)
mirai_flow_df["Destination IP Numeric"] = mirai_flow_df["Destination IP"].apply(
    ip_to_numeric
)

In [None]:
# Add integer versions of the source and destination IPs to the benign flows.
# The original string columns are kept; they are dropped in a later cell.
benign_flow_df["Source IP Numeric"] = benign_flow_df["Source IP"].apply(ip_to_numeric)
benign_flow_df["Destination IP Numeric"] = benign_flow_df["Destination IP"].apply(
    ip_to_numeric
)

In [None]:
# Build the all-numeric frames by dropping the string IP columns; the numeric
# IP columns added above remain.
mirai_flow_df_numeric = mirai_flow_df.drop(columns=["Source IP", "Destination IP"])
benign_flow_df_numeric = benign_flow_df.drop(columns=["Source IP", "Destination IP"])

In [None]:
# Make sure Duration is stored as float in the numeric frames. The result has
# to be written back to the *_numeric frames (not the original flow frames),
# because those are the ones inspected and pickled in the following cells.
mirai_flow_df_numeric["Duration"] = mirai_flow_df_numeric["Duration"].astype(float)
benign_flow_df_numeric["Duration"] = benign_flow_df_numeric["Duration"].astype(float)

In [None]:
# Check that every column of the numeric Mirai frame now has a numeric dtype.
mirai_flow_df_numeric.dtypes

In [None]:
# Save the numeric flow dataframes as pickles so they can be reused in the
# next blog post without repeating the preprocessing.
mirai_flow_df_numeric.to_pickle("../data/blog_eda/mirai_flow_numeric.pkl")
benign_flow_df_numeric.to_pickle("../data/blog_eda/benign_flow_numeric.pkl")

# More EDA
After converting all columns to numerical, we can do more exploration.

## Correlation

In [None]:
# Calculate the correlation matrix over all columns of the numeric Mirai frame
correlation_matrix = mirai_flow_df_numeric.corr()

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)

## Autocorrelation

In [None]:
# Calculate the autocorrelation for a specific column (e.g., 'Number of Packets').
# NOTE(review): the rows are flows in the order produced by the groupby in
# extract_flows, not a time-ordered series — confirm this ordering is what the
# autocorrelation is meant to measure.
autocorrelation = mirai_flow_df_numeric["Number of Packets"].autocorr()

print("\nAutocorrelation for 'Number of Packets':")
print(autocorrelation)

# Summaries & Visualizations

In [None]:
# skimpy summary of the numeric Mirai flows.
skim(mirai_flow_df_numeric)

In [None]:
# skimpy summary of the numeric benign flows.
skim(benign_flow_df_numeric)

In [None]:
# summarytools summary of the numeric Mirai flows.
dfSummary(mirai_flow_df_numeric)

In [None]:
# summarytools summary of the numeric benign flows.
dfSummary(benign_flow_df_numeric)

In [None]:
# Sweetviz report for the numeric Mirai flows, rendered as HTML.
my_report = sv.analyze(mirai_flow_df_numeric)
my_report.show_html()

In [None]:
# Sweetviz report for the numeric benign flows, rendered as HTML.
# Rebinds my_report, so the Mirai report object from the previous cell is replaced.
my_report = sv.analyze(benign_flow_df_numeric)
my_report.show_html()