In [None]:
import pandas as pd

# Load the scan capture from the blog_eda data directory (path is relative to
# the notebook's working directory).
scan = pd.read_csv("../data/blog_eda/scan.csv")
# (rows, columns) of the loaded frame
scan.shape

In [None]:
# Column labels available in the scan capture
scan.columns

In [None]:
# dtype pandas inferred for each column when reading the CSV
scan.dtypes

In [None]:
# Summary statistics for the scan capture
scan.describe()

In [None]:
# Load the Mirai capture from the same blog_eda data directory.
mirai = pd.read_csv("../data/blog_eda/mirai.csv")

# Column labels available in the Mirai capture
mirai.columns

In [None]:
def count_packets(df, column):
    """Count how many rows (packets) carry each distinct value of `column`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one row per packet.
    column : str
        Label of the column whose values are tallied.

    Returns
    -------
    pd.Series
        The result of `value_counts()` on that column: distinct values as the
        index, their row counts as the values.
    """
    return df[column].value_counts()


def count_bytes(df, column):
    """Sum the "Length" column for each distinct value of `column`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one row per packet; must contain a "Length" column.
    column : str
        Label of the column to group by.

    Returns
    -------
    pd.Series
        Summed "Length" per group, indexed by the group key.
    """
    per_key = df.groupby(column)
    return per_key["Length"].sum()


def count_packets_stream(df, source_ip, source_port):
    """Count the rows whose source IP and source port both match.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with "source ip" and "source port" columns.
    source_ip, source_port
        Values compared for equality against those two columns.

    Returns
    -------
    int
        Number of matching rows.

    NOTE(review): the labels here are lower-case ("source ip"), while other
    cells in this notebook use title-cased labels such as "Source IP" —
    confirm which spelling the data actually has.
    """
    same_ip = df["source ip"] == source_ip
    same_port = df["source port"] == source_port
    return len(df[same_ip & same_port])


# Summed "Length" per distinct "Source" value in the Mirai capture
count_bytes(mirai, "Source")

# Stream aggregation:
- H: Stats summarizing the recent traffic from this packet's host (IP).
- HH: Stats summarizing the recent traffic going from this packet's host (IP) to the packet's destination host.
- HpHp: Stats summarizing the recent traffic going from this packet's host+port (IP:port) to the packet's destination host+port. Example: 192.168.4.2:1242 -> 192.168.4.12:80.
- HH_jit: Stats summarizing the jitter of the traffic going from this packet's host (IP) to the packet's destination host.

In [None]:
def extract_streams(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse packet rows into one summary row per stream.

    A stream is the set of packets sharing the same ("Source IP",
    "Destination IP", "Source Port", "Destination Port", "Protocol") tuple.

    Parameters
    ----------
    df : pd.DataFrame
        Packet-level frame. Must contain the five key columns above plus
        "Packet Length" and "Timestamp". "Timestamp" may be numeric or
        datetime-like.

    Returns
    -------
    pd.DataFrame
        One row per stream, ordered by the sorted key tuple, holding the key
        columns followed by:

        - "Number of Packets": row count of the stream
        - "Total Length": sum of "Packet Length"
        - "Duration": last minus first "Timestamp" as a float (seconds when
          "Timestamp" is datetime-like, otherwise in the units of "Timestamp")

        An empty input yields an empty frame with these columns.

    Notes
    -----
    Rows with a missing value in any key column are left out, because
    `groupby` drops NaN keys by default.
    """
    key_cols = [
        "Source IP",
        "Destination IP",
        "Source Port",
        "Destination Port",
        "Protocol",
    ]
    summary_cols = ["Number of Packets", "Total Length", "Duration"]

    # Nothing to group: return the expected schema instead of failing.
    if df.empty:
        return pd.DataFrame(columns=key_cols + summary_cols)

    # One vectorized aggregation replaces building a one-row frame per group.
    streams = (
        df.groupby(key_cols)
        .agg(
            **{
                "Number of Packets": ("Packet Length", "size"),
                "Total Length": ("Packet Length", "sum"),
                "_first_seen": ("Timestamp", "min"),
                "_last_seen": ("Timestamp", "max"),
            }
        )
        .reset_index()
    )

    elapsed = streams["_last_seen"] - streams["_first_seen"]
    # Datetime timestamps subtract to timedeltas, which cannot be cast to
    # float directly; express them in seconds first.
    if elapsed.dtype.kind == "m":
        elapsed = elapsed.dt.total_seconds()
    streams["Duration"] = elapsed.astype(float)

    # Drop the helper columns and fix the output column order.
    return streams[key_cols + summary_cols]

In [None]:
# Collapse the packet-level frame into one row per stream.
# NOTE(review): `mirai_df` is not defined in the cells shown here (the capture
# loaded earlier is named `mirai`) — confirm where `mirai_df` comes from so this
# cell survives Restart & Run All.
mirai_stream_df = extract_streams(mirai_df)

## Categorical
- One-hot encoding for ports
- Word2vec encoding for payload (add payload to `stream_df`)

TODO: This section was moved over from the BSides talk; word2vec needs to be applied to the raw packet data rather than to the streams.

In [None]:
# Add the payload column back after all the numeric EDA.
# NOTE(review): `join` aligns on index labels, so each stream row receives the
# payload of the packet row with the same index label — confirm that alignment
# is what is intended.
mirai_stream_df_engineered = mirai_stream_df_numeric.join(mirai_df["Payload"])
# Fixed: the benign frame used to join mirai_df["Payload"] (copy-paste slip),
# which attached Mirai payloads to the benign streams.
benign_stream_df_engineered = benign_stream_df_numeric.join(benign_df["Payload"])

In [None]:
# Tokenize every payload; each resulting token list is one "sentence" for the
# vectorization step.
# NOTE(review): `simple_preprocess` is not imported in the cells shown here —
# presumably gensim.utils.simple_preprocess; confirm the import exists.
mirai_sentences = list(map(simple_preprocess, mirai_df["Payload"]))
benign_sentences = list(map(simple_preprocess, benign_df["Payload"]))

In [None]:
# Train one Word2Vec model per traffic class, both with identical settings.
# NOTE(review): `Word2Vec` is not imported in the cells shown here — presumably
# gensim.models.Word2Vec; confirm the import exists.
# NOTE(review): workers=4 presumably trains multi-threaded, so vectors may
# differ between runs — confirm whether reproducibility matters here.
mirai_model = Word2Vec(sentences=mirai_sentences, window=5, min_count=1, workers=4)
benign_model = Word2Vec(sentences=benign_sentences, window=5, min_count=1, workers=4)

In [None]:
def _mirai_payload_vectors(payload):
    """Look up `mirai_model` vectors for one payload; None when it has no tokens."""
    # Tokenize once and reuse the result for both the emptiness check and the
    # lookup (the previous lambda ran simple_preprocess twice per row).
    tokens = simple_preprocess(payload)
    return mirai_model.wv[tokens] if tokens else None


mirai_stream_df_engineered["Payload_vectors"] = mirai_stream_df_engineered[
    "Payload"
].apply(_mirai_payload_vectors)

In [None]:
def try_preprocess(payload, model):
    """Return `model.wv[tokens]` for a payload, or None when that is not possible.

    Parameters
    ----------
    payload
        Value handed to `simple_preprocess` to obtain a token list.
    model
        Object exposing a `wv` attribute that can be indexed with a list of
        tokens (the Word2Vec models in this notebook are used this way).

    Returns
    -------
    The result of `model.wv[tokens]`, or None when the payload yields no
    tokens or a KeyError is raised along the way.
    """
    try:
        # Tokenize once; the token list serves both the emptiness check and
        # the lookup.
        tokens = simple_preprocess(payload)
        if not tokens:
            return None
        return model.wv[tokens]
    except KeyError:
        # Deliberate best-effort: a failed lookup gives None instead of
        # aborting the whole `apply`.
        return None

In [None]:
# Vectorize each benign payload with the benign model; rows for which
# try_preprocess cannot produce vectors get None.
benign_stream_df_engineered["Payload_vectors"] = benign_stream_df_engineered[
    "Payload"
].apply(try_preprocess, args=(benign_model,))