In [None]:
import pandas as pd
import numpy as np
import sweetviz as sv
import ipaddress


from scapy.all import PcapReader, IP, TCP, UDP, ICMP
from scipy.stats import ttest_ind, kstest, norm, skew, kurtosis, zscore
from sklearn.linear_model import LinearRegression
from skimpy import skim
from summarytools import dfSummary

In [None]:
# Open the Mirai capture with scapy's PcapReader and show the reader's type.
# NOTE(review): the reader is never closed in the visible cells, and the only
# loop that iterates over it is commented out further down — confirm this
# cell is still needed.
pcap_reader = PcapReader("../data/blog_show_data/mirai.pcap")
type(pcap_reader)

# Preprocess

- convert data to streams
- collect some numbers

In [None]:
# NOTE: the packet-parsing loop in this cell is commented out; the following
# cell loads `mirai_df` from a pickle instead. Uncomment the code below to
# rebuild `mirai_df` directly from `pcap_reader`.
# Create an empty list to store the data
# data = []

# # Iterate through the packets in the pcap file
# for packet in pcap_reader:
#     # Get the source and destination IP addresses
#     if packet.haslayer(IP):
#         src_ip = packet[IP].src
#         dst_ip = packet[IP].dst
#         protocol = packet[IP].proto
#     else:
#         src_ip = None
#         dst_ip = None
#         protocol = None

#     # Get the source and destination ports and payload
#     if packet.haslayer(TCP):
#         src_port = packet[TCP].sport
#         dst_port = packet[TCP].dport
#         payload = str(packet[TCP].payload)
#         packet_len = len(packet[TCP])
#     elif packet.haslayer(UDP):
#         src_port = packet[UDP].sport
#         dst_port = packet[UDP].dport
#         payload = str(packet[UDP].payload)
#         packet_len = len(packet[UDP])
#     elif packet.haslayer(ICMP):
#         payload = str(packet[ICMP].payload)
#         packet_len = len(packet[ICMP])
#         src_port = None
#         dst_port = None
#     else:
#         src_port = None
#         dst_port = None
#         payload = str(packet.payload)
#         packet_len = len(packet)

#     # Append the data to the list
#     data.append([packet.time, src_ip, dst_ip, src_port, dst_port, payload, packet_len, protocol])

# # Convert the list to a pandas dataframe
# mirai_df = pd.DataFrame(data, columns=['Timestamp', 'Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Payload', 'Packet Length', 'Protocol'])

# mirai_df

In [None]:
# Load the packet-level dataframe from a pickle on disk. The commented-out
# line is presumably the one-off export that wrote this file.
# SECURITY: unpickling can execute arbitrary code; only load pickle files
# that you created yourself or otherwise trust.
# mirai_df.to_pickle("../data/bsides_aug/mirai.pkl")
mirai_df = pd.read_pickle("../data/bsides_aug/mirai.pkl")

In [None]:
# Display the packet-level dataframe loaded in the previous cell.
mirai_df

In [None]:
# Columns that together identify one unidirectional stream (flow).
stream_keys = ["Source IP", "Destination IP", "Source Port", "Destination Port", "Protocol"]

# Group packets by src/dst IP, src/dst port and protocol.
# NOTE: groupby drops rows whose key columns contain missing values by
# default, so packets lacking an IP address, port or protocol do not show up
# in the stream table.
grouped = mirai_df.groupby(stream_keys)

# Aggregate every stream in a single pass instead of building one tiny
# DataFrame per group and concatenating them afterwards. Besides being much
# faster, this also yields an empty frame (rather than raising) when there
# are no groups.
stream_df = grouped.agg(
    **{
        # number of packets in the stream (row count of the group)
        "Number of Packets": ("Packet Length", "size"),
        # total bytes carried by the stream
        "Total Length": ("Packet Length", "sum"),
        # seconds between the first and the last packet of the stream;
        # float() mirrors the original conversion of the timestamp difference
        "Duration": ("Timestamp", lambda ts: float(ts.max() - ts.min())),
    }
).reset_index()  # turn the group keys back into regular columns

# Print the new dataframe with stream data
print(stream_df)

# EDA

## Descriptive statistics & data

- Describe columns and data types
- Descriptive statistics
  -  count, 
  -  mean, 
  -  standard deviation, 
  -  minimum, 
  -  25th percentile, 
  -  median (50th percentile), 
  -  75th percentile, and 
  -  maximum

In [None]:
# describe, summarize etc.
# List the column names of the per-stream dataframe.
stream_df.columns

In [None]:
# Show the dtype pandas assigned to each stream column.
stream_df.dtypes

In [None]:
# descriptive statistics
# describe() reports count, mean, std, min, quartiles and max per column.
stream_df.describe()

In [None]:
# correlation matrix for numerical values in dataframe
# Select the numeric columns explicitly: stream_df also holds the string IP
# columns, and pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() (it raises instead).
stream_df.select_dtypes(include=[np.number]).corr()

## Hypothesis testing

- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that data follows normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, whether a network is compromised, which may depend on the number of HTTP requests.
  - independent: the cause. For example, the number of HTTP requests, which affects whether a network is compromised.

In [None]:
def hypothesis_testing(df, col1, col2, alpha=0.05):
    """Compare the means of two columns with an independent two-sample t-test.

    Args:
        df: DataFrame that contains both columns.
        col1: Name of the column used as the first sample.
        col2: Name of the column used as the second sample.
        alpha: Significance level the p-value is compared against
            (defaults to 0.05, the previously hard-coded threshold).

    Returns:
        A sentence stating whether the difference between the two columns is
        statistically significant at ``alpha``. A NaN p-value fails the
        ``p < alpha`` comparison and is therefore reported as not significant.
    """
    pvalue = ttest_ind(df[col1], df[col2]).pvalue
    if pvalue < alpha:
        return "The difference between {} and {} is statistically significant (p < {})".format(
            col1, col2, alpha
        )
    return "The difference between {} and {} is not statistically significant (p >= {})".format(
        col1, col2, alpha
    )

In [None]:
# t-test between the per-stream packet count and the per-stream byte count.
# NOTE(review): the two columns are measured in different units, so a
# difference in means is expected — confirm this is meant as a demo only.
hypothesis_testing(stream_df, "Number of Packets", "Total Length")

## Regression Analysis

- Models relationship between a dependent variable and one or more independent variables
- Linear regression
  - fit a line to the data
  - calculate coefficients

In [None]:
def regression_analysis(df, x_cols, y_col):
    """Fit an ordinary linear regression of ``y_col`` on ``x_cols``.

    Args:
        df: DataFrame holding the predictor and target columns.
        x_cols: List of column names used as predictors.
        y_col: Name of the target column.

    Returns:
        dict with the model's score on the training data under "R-squared"
        and the fitted ``coef_`` array under "Coefficients".
    """
    n_features = len(x_cols)
    # One row per observation, one column per predictor.
    features = df[x_cols].values.reshape(-1, n_features)
    # Target as a single-column 2-D array.
    target = df[y_col].values.reshape(-1, 1)

    fitted = LinearRegression().fit(features, target)
    return {
        "R-squared": fitted.score(features, target),
        "Coefficients": fitted.coef_,
    }

In [None]:
# Regress total stream length on packet count and stream duration.
regression_analysis(stream_df, ["Number of Packets", "Duration"], "Total Length")

## Kolmogorov-Smirnov test

- compare two sample distributions
- useful for fitting to a distribution
- tests whether:
  - one sample came from a given reference distribution (one-sample test)
  - two samples came from the same distribution (two-sample test)
- Uses metric `D`
  - max absolute difference between empirical distribution function of the samples and cumulative distribution of the reference distribution
- Null hypothesis: 
  - samples came from the reference distribution
  - samples came from the same distribution

In [None]:
def kolmogorov_smirnov_test(df, column, alpha=0.05):
    """Test one column against a normal distribution with a one-sample KS test.

    The reference distribution is a normal whose mean and standard deviation
    are taken from the sample itself (``ndarray.std`` with its default
    ``ddof=0``).

    NOTE(review): because the reference parameters are estimated from the
    same data, the standard KS p-value is only approximate; consider a
    Lilliefors-style correction if the exact p-value matters.

    Args:
        df: DataFrame that contains the column.
        column: Name of the column to test.
        alpha: Significance level the p-value is compared against
            (defaults to 0.05, the previously hard-coded threshold).

    Returns:
        A sentence stating whether the column's distribution differs
        significantly from the fitted normal distribution at ``alpha``.
    """
    sample = df[column].values
    _, pvalue = kstest(sample, norm.cdf, args=(sample.mean(), sample.std()))
    if pvalue < alpha:
        return "The distribution of {} is significantly different from a normal distribution (p < {})".format(
            column, alpha
        )
    return "The distribution of {} is not significantly different from a normal distribution (p >= {})".format(
        column, alpha
    )

In [None]:
# Check whether the per-stream total length looks normally distributed.
kolmogorov_smirnov_test(stream_df, "Total Length")

## Skewness and Kurtosis

- information about the shape of the distribution
- Skewness: measure the degree of asymmetry
  - symmetric: equally balanced around its mean
  - asymmetric: not equally balanced
  - positive skewness: distribution longer on the right side
  - negative skewness: longer on the left
  - 0: completely symmetric
- Kurtosis: peakedness of distribution
  - high: sharp peak, long tails
  - low: flat peak, short tails
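  - ex. normal distribution has kurtosis 3, mesokurtic
    - `> 3` leptokurtic
    - `< 3` platykurtic
    - note: `scipy.stats.kurtosis` returns *excess* kurtosis (kurtosis − 3) by default, so the values computed below should be compared against 0 rather than 3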

In [None]:
def skewness_kurtosis(df):
    """Compute skewness and kurtosis for every numeric column of ``df``.

    Kurtosis is computed with scipy's defaults, i.e. excess (Fisher)
    kurtosis, for which a normal distribution scores 0.

    Args:
        df: DataFrame to summarise; non-numeric columns are ignored.

    Returns:
        dict mapping "<column>_skewness" and "<column>_kurtosis" to the
        corresponding statistic, in column order.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    stats = {}
    for name in numeric_columns:
        values = df[name]
        stats.update(
            {
                name + "_skewness": skew(values),
                name + "_kurtosis": kurtosis(values),
            }
        )
    return stats

In [None]:
# Skewness and kurtosis for every numeric stream column.
skewness_kurtosis(stream_df)

## Outliers

- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- significant impact in statistical analysis
- measurements
  - z-score: `(x - mean) / std_dev`
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5IQR` or above `Q3 + 1.5IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [None]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is a z-score outlier.

    Args:
        df: DataFrame to filter.
        column: Name of the numeric column the z-scores are computed on.
        threshold: Rows with an absolute z-score strictly greater than this
            value are returned.

    Returns:
        The subset of ``df`` (all columns) flagged as outliers.
    """
    deviation = np.abs(zscore(df[column]))
    is_outlier = deviation > threshold
    return df[is_outlier]

In [None]:
# Streams whose total length lies more than 3 standard deviations from the mean.
outliers = detect_outliers_zscore(stream_df, "Total Length", threshold=3)
print(outliers)

# Feature Engineering

In [None]:
# convert ip address to numeric values
def ip_to_numeric(ip):
    """Convert an IP address string to its integer representation.

    The value is parsed as an interface, so an input carrying a prefix
    (e.g. "10.0.0.1/8") maps to the integer of its network address; a bare
    address maps to the integer of the address itself.

    Args:
        ip: IPv4 or IPv6 address, optionally with a prefix length.

    Returns:
        int: The network address as an integer.
    """
    network = ipaddress.ip_interface(ip).network
    return int(network.network_address)

In [None]:
# Add integer-encoded copies of both IP columns. This adds two columns to
# stream_df in place; the original string columns are kept.
stream_df["Source IP Numeric"] = stream_df["Source IP"].apply(ip_to_numeric)
stream_df["Destination IP Numeric"] = stream_df["Destination IP"].apply(ip_to_numeric)

In [None]:
# Feature frame without the string IP columns (drop returns a new frame;
# stream_df itself is left unchanged).
stream_df_numeric = stream_df.drop(columns=["Source IP", "Destination IP"])

In [None]:
# Make sure Duration is stored as float in the numeric feature frame. The
# cast is written to stream_df_numeric, the frame whose dtypes are inspected
# next and that feeds the summaries below.
stream_df_numeric["Duration"] = stream_df_numeric["Duration"].astype(float)

In [None]:
# Show the column dtypes of the numeric feature frame.
stream_df_numeric.dtypes

## Summaries & Visualizations

In [None]:
# skimpy summary of the numeric stream features.
skim(stream_df_numeric)

In [None]:
# summarytools overview of the numeric stream features.
dfSummary(stream_df_numeric)

In [None]:
# Build a sweetviz report for the numeric stream features and show it as HTML.
# NOTE(review): show_html() presumably writes an HTML file to the working
# directory — confirm where the output should go.
my_report = sv.analyze(stream_df_numeric)
my_report.show_html()

# Feature engineering: categorical
- one hot encoding for ports
- word2vec encoding for payload (add payload to `stream_df`)

In [None]:
# add back payload
# Payloads live at packet granularity in mirai_df, while stream_df_numeric has
# one row per stream. A plain index join would pair stream row i with packet
# row i, which is unrelated data. Instead, concatenate the payloads of every
# stream and attach them through the stream key columns, which are still
# present on stream_df (stream_df_numeric shares its index with stream_df).
stream_keys = ["Source IP", "Destination IP", "Source Port", "Destination Port", "Protocol"]
payload_per_stream = (
    mirai_df.groupby(stream_keys)["Payload"]
    # groupby keeps packet order inside each stream; join payloads with spaces
    .agg(lambda payloads: " ".join(payloads.astype(str)))
    .reset_index()
)
stream_payload = stream_df[stream_keys].merge(
    payload_per_stream, on=stream_keys, how="left"
)["Payload"]
# merge() returns a fresh RangeIndex; restore stream_df's index so the join
# below aligns row by row.
stream_payload.index = stream_df.index
stream_df_engineered = stream_df_numeric.join(stream_payload)

In [None]:
# word2vec for payload
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

In [None]:
# Tokenize every packet payload; each payload becomes one list of tokens.
# NOTE(review): the bare `sentences` below displays the whole list, which can
# be very large — consider showing only a slice.
sentences = [simple_preprocess(payload) for payload in mirai_df["Payload"]]
sentences

In [None]:
# Train a Word2Vec model on the tokenized payloads; min_count=1 keeps every
# token in the vocabulary.
# NOTE(review): training with workers > 1 is presumably not reproducible
# run-to-run — confirm whether deterministic vectors are required.
model = Word2Vec(sentences=sentences, window=5, min_count=1, workers=4)

In [None]:
def _payload_to_vectors(payload):
    """Look up the word vectors for a payload's tokens.

    Returns the result of indexing ``model.wv`` with the token list, or None
    when preprocessing yields no tokens.
    """
    tokens = simple_preprocess(payload)  # tokenize once, not once per branch
    return model.wv[tokens] if tokens else None


stream_df_engineered["Payload_vectors"] = stream_df_engineered["Payload"].apply(
    _payload_to_vectors
)

# Model Training
- models
  - xgboost
  - NN
- k-fold cross validation

In [None]:
# what are we training for? benign vs malicious