In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from read_pcaps import pcap_to_dataframe

In [None]:
# Cache switch for the malware captures used later in this notebook:
#   True  -> load the previously pickled DataFrames
#   False -> re-read the raw pcaps, convert them to DataFrames and re-write the pickles
READ_FROM_PKL = True

In [None]:
# Load the pickled Mirai and benign traffic frames (presumably produced by the
# EDA notebook, given the blog_eda path — confirm).
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")

# Protocol Label Encoding

In [None]:
# Protocol names that extract_protocol looks for inside each payload using a
# case-sensitive `name in payload` test. Entries are tried in list order.
protocols = [
    "ARP",
    "HTTP",
    "HTTPS",
    "FTP",
    "FTPS",
    "SMTP",
    "POP3",
    "IMAP",
    "Telnet",
    "DNS",
    "DHCP",
    "SNMP",
    "NTP",
    "SSH",
    "SMB",
    "LDAP",
    "SIP",
    "ICMP",
]

In [None]:
def extract_protocol(payload, known_protocols=None):
    """Return the first known protocol name that occurs in ``payload``.

    Names are tried in list order using a case-sensitive substring test. When
    the first matching name is itself a prefix/substring of a longer known name
    that is also present in the payload (``HTTP`` vs ``HTTPS``, ``FTP`` vs
    ``FTPS``), the longer, more specific name is returned. Without this step the
    shorter name always wins and the longer one can never be produced.

    Parameters
    ----------
    payload : str
        Payload text to inspect. Any non-string value (e.g. NaN for a missing
        payload) yields ``None`` instead of raising ``TypeError``.
    known_protocols : sequence of str, optional
        Names to look for. Defaults to the notebook-level ``protocols`` list.

    Returns
    -------
    str or None
        The detected protocol name, or ``None`` when nothing matches.
    """
    if not isinstance(payload, str):
        return None

    candidates = protocols if known_protocols is None else known_protocols
    for name in candidates:
        if name not in payload:
            continue
        # Prefer a longer known name that contains this one and is also present.
        more_specific = [
            other
            for other in candidates
            if other != name and name in other and other in payload
        ]
        return max(more_specific, key=len) if more_specific else name
    return None

In [None]:
# Tag every packet of both captures with the protocol name found in its payload
for traffic_df in (benign_df, mirai_df):
    traffic_df["payload_proto"] = traffic_df["Payload"].apply(extract_protocol)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Packets whose payload matched no known protocol carry a missing value; give
# them an explicit label so the encoder only ever sees strings.
UNKNOWN_PROTOCOL = "UNKNOWN"
mirai_protocols = mirai_df["payload_proto"].fillna(UNKNOWN_PROTOCOL)
benign_protocols = benign_df["payload_proto"].fillna(UNKNOWN_PROTOCOL)

# Fit ONE encoder on the labels of both captures. Fitting a separate encoder
# per frame assigns codes independently, so the same protocol could end up with
# different integers in mirai_df and benign_df and the columns would not be
# comparable.
protocol_encoder = LabelEncoder().fit(pd.concat([mirai_protocols, benign_protocols]))

mirai_df["protocol_encoded"] = protocol_encoder.transform(mirai_protocols)
benign_df["protocol_encoded"] = protocol_encoder.transform(benign_protocols)

In [None]:
# Show the last rows to inspect the newly added payload_proto / protocol_encoded columns
mirai_df.tail()

# Service one-hot encoding

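Read two more malware captures (a GuLoader/ModiLoader-style infection delivering Remcos RAT, and a TA577 Pikabot infection with Cobalt Strike) to get more varied traffic for the encodings below.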

In [None]:
# Parse the two raw malware captures; skipped when the cached pickles are used.
# NOTE(review): `rdpcap` is not imported in the visible import cell — presumably
# it is scapy's reader (`from scapy.all import rdpcap`); confirm and add the
# import, otherwise this cell raises NameError when READ_FROM_PKL is False.
if not READ_FROM_PKL:
    guloader = rdpcap(
        "../data/blog_fe/2023-06-26-guloader-or-modiloader-style-infection-for-Remcos-RAT.pcap"
    )
    picabot = rdpcap(
        "../data/blog_fe/2023-12-18-TA577-Pikabot-infection-with-Cobalt-Strike.pcap"
    )

In [None]:
# Convert the parsed captures to DataFrames with the project helper
# pcap_to_dataframe; only runs on the non-cached path.
if not READ_FROM_PKL:
    guloader_df = pcap_to_dataframe(guloader)
    picabot_df = pcap_to_dataframe(picabot)

In [None]:
# DataFrame conversion is slow, so cache the freshly converted frames as pickles
if not READ_FROM_PKL:
    for converted_df, pickle_path in (
        (guloader_df, "../data/blog_fe/guloader.pkl"),
        (picabot_df, "../data/blog_fe/picabot.pkl"),
    ):
        converted_df.to_pickle(pickle_path)

In [None]:
# Number -> service/protocol name. one_hot_port creates one output column per
# NAME, so every name must be unique: two keys sharing a name would make the
# later key overwrite the column of the earlier one.
# NOTE(review): several keys (1, 6, 17, 41, 47, 50, 51, 58, 89, 132, 136) are IP
# protocol numbers rather than TCP/UDP ports. When this mapping is applied to a
# port column they only fire if a packet's port happens to equal that number —
# confirm this is intended.
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    # Was a second "SCTP" entry, which clobbered the column for 132.
    # Port 135 is the MS-RPC endpoint mapper.
    135: "MSRPC",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [None]:
# Cached path: load the previously pickled malware DataFrames instead of
# re-parsing the pcaps.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
if READ_FROM_PKL:
    guloader_df = pd.read_pickle("../data/blog_fe/guloader.pkl")
    picabot_df = pd.read_pickle("../data/blog_fe/picabot.pkl")

In [None]:
def one_hot_port(port, df, protocol_map=None):
    """One-hot encode a port column into one 0/1 column per service name.

    Parameters
    ----------
    port : str
        Name of the column in ``df`` holding the port numbers.
    df : pandas.DataFrame
        Frame containing the ``port`` column.
    protocol_map : dict, optional
        Mapping ``{port_number: service_name}``. Defaults to the notebook-level
        ``network_protocols`` dictionary.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with one int64 column per distinct service
        name, in first-appearance order. A cell is 1 when the row's port equals
        a number mapped to that name, else 0. If several numbers share a name,
        their matches are OR-ed together instead of the last one silently
        overwriting the others.
    """
    if protocol_map is None:
        protocol_map = network_protocols

    port_values = df[port]
    flags_by_name = {}
    for protocol_port, protocol_name in protocol_map.items():
        # Vectorized comparison; missing ports never match.
        hits = port_values.eq(protocol_port).fillna(False).to_numpy(dtype=bool)
        if protocol_name in flags_by_name:
            hits = flags_by_name[protocol_name] | hits
        flags_by_name[protocol_name] = hits

    return pd.DataFrame(
        {name: hits.astype("int64") for name, hits in flags_by_name.items()},
        index=df.index,
    )

In [None]:
# One-hot encode the GuLoader capture's destination ports into service columns
guloader_protocol_one_hot = one_hot_port("Destination Port", guloader_df)

In [None]:
# Display the one-hot encoded GuLoader frame
guloader_protocol_one_hot

In [None]:
# One-hot encode the Pikabot capture's destination ports into service columns
picabot_protocol_one_hot = one_hot_port("Destination Port", picabot_df)

In [None]:
# Display the one-hot encoded Pikabot frame
picabot_protocol_one_hot

# Packet length ordinal encoding

In [None]:
def length_ordinal_mapping(length, low, medium, high):
    """Map a numeric length onto the ordinal labels "LOW", "MEDIUM" or "HIGH".

    ``length <= low`` gives "LOW"; otherwise ``length <= medium`` gives
    "MEDIUM"; everything else (including values that compare false against
    both bounds, such as NaN) gives "HIGH".

    ``high`` is accepted for interface compatibility but is not consulted.
    """
    for upper_bound, label in ((low, "LOW"), (medium, "MEDIUM")):
        if length <= upper_bound:
            return label
    return "HIGH"

In [None]:
# Bucket the packet lengths of both malware captures into LOW / MEDIUM / HIGH
# with bounds 60 and 256 (1024 is passed as the `high` argument)
for capture_df in (guloader_df, picabot_df):
    capture_df["Risk"] = capture_df["Packet Length"].apply(
        lambda packet_length: length_ordinal_mapping(packet_length, 60, 256, 1024)
    )

In [None]:
# Display the GuLoader frame, now including the Risk column
guloader_df

# Protocol frequency encoding

In [None]:
# Replace each Protocol value by its relative frequency within the Mirai capture
protocol_column = mirai_df["Protocol"]
frequency_encoding = protocol_column.value_counts(normalize=True).to_dict()
mirai_df["Protocol_freq_encoded"] = protocol_column.map(frequency_encoding)

In [None]:
# Display the {protocol value: relative frequency} mapping
frequency_encoding

In [None]:
# Display the Mirai frame, now including Protocol_freq_encoded
mirai_df

# Embeddings

In [None]:
# Lower-case every payload and split it on whitespace to obtain word tokens
payload_text = mirai_df["Payload"]
tokenized_payloads = payload_text.apply(lambda payload: payload.lower().split())

tokenized_payloads

In [None]:
# Fit Word2Vec on the tokenized payloads: 100-dimensional vectors, context
# window of 5, every token kept (min_count=1), 4 worker threads
word2vec_settings = {"vector_size": 100, "window": 5, "min_count": 1, "workers": 4}
word2vec_model = Word2Vec(sentences=tokenized_payloads, **word2vec_settings)

In [None]:
def average_word_embedding(sentence, model):
    """Average the embedding vectors of the tokens in ``sentence``.

    Tokens missing from ``model.wv`` are skipped. If no token is known, a zero
    vector of length ``model.vector_size`` is returned, so the function always
    yields a vector.
    """
    known_vectors = []
    for token in sentence:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Load the spaCy pipeline whose `.vector` attribute supplies the word
# embeddings used below
nlp = spacy.load("en_core_web_md")

# List of cybersecurity words
cybersecurity_words = [
    "firewall",
    "malware",
    "phishing",
    "encryption",
    "authentication",
    "vulnerability",
    "patch",
    "incident",
    "antivirus",
    "cryptography",
    "update",
    "spyware",
    "verification",
]


def get_word_embeddings(word_list):
    """Return ``{word: vector}`` for every word in ``word_list``.

    Each vector is the ``.vector`` of the document obtained by running the
    notebook-level spaCy pipeline ``nlp`` on the word.
    """
    return {term: nlp(term).vector for term in word_list}


# Get word embeddings for the cybersecurity words
word_embeddings = get_word_embeddings(cybersecurity_words)

# Print each word followed by its full embedding vector (verbose output)
for word, embedding in word_embeddings.items():
    print(f"{word}: {embedding}")

In [None]:
def calculate_cosine_similarity(embeddings, dictionary):
    """Compute pairwise cosine similarities between the given embeddings.

    Parameters
    ----------
    embeddings : mapping
        ``{word: vector}``; iteration order defines the pair/row order.
    dictionary : truthy or falsy
        Truthy: return a dict keyed by ``(word1, word2)`` for every ordered
        pair of different words.
        Falsy: return a square NumPy array whose entry ``[i, j]`` is the
        similarity between the i-th and j-th word; the diagonal is left at 0.
    """

    def _pair_similarity(first, second):
        # sklearn expects 2-D inputs and returns a 1x1 matrix for a single pair
        return cosine_similarity([embeddings[first]], [embeddings[second]])[0][0]

    words = list(embeddings)

    if dictionary:
        return {
            (first, second): _pair_similarity(first, second)
            for first in words
            for second in words
            if first != second
        }

    matrix = np.zeros((len(words), len(words)))
    for row, first in enumerate(words):
        for col, second in enumerate(words):
            if row != col:
                matrix[row, col] = _pair_similarity(first, second)
    return matrix

In [None]:
# Embed the cybersecurity words (same mapping as built earlier with the helper)
word_embeddings = get_word_embeddings(cybersecurity_words)

# Dictionary mode: one entry per ordered pair of different words
similarities = calculate_cosine_similarity(word_embeddings, True)

# Report every pair with its similarity rounded to four decimals
for pair, similarity in similarities.items():
    print(f"Cosine Similarity between '{pair[0]}' and '{pair[1]}': {similarity:.4f}")

In [None]:
# Array mode: square similarity matrix (diagonal left at 0) for plotting
similarities = calculate_cosine_similarity(word_embeddings, 0)

# Draw the annotated heatmap on an explicit figure/axes pair
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    similarities,
    annot=True,
    cmap="YlGnBu",
    xticklabels=cybersecurity_words,
    yticklabels=cybersecurity_words,
    ax=ax,
)
ax.set_title("Cosine Similarities between Cybersecurity Words")
plt.show()


In [None]:
# Average the Word2Vec vectors of each payload's tokens into a single vector
# per packet and store it in the 'payload_embedding' column
payload_vectors = tokenized_payloads.apply(
    average_word_embedding, args=(word2vec_model,)
)
mirai_df["payload_embedding"] = payload_vectors

# Show the frame with the new column
mirai_df

In [None]:
# Load the Universal Sentence Encoder (version 4) from TensorFlow Hub.
# hub.load is given a remote URL, so this presumably needs network access (or a
# populated TF Hub cache) on first run — confirm for offline environments.
use_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_embed = hub.load(use_url)


def generate_embeddings(payloads):
    """Embed ``payloads`` with the loaded Universal Sentence Encoder.

    Passes ``payloads`` to the notebook-level ``use_embed`` model and returns
    the result converted with ``.numpy()``.
    """
    return use_embed(payloads).numpy()

In [None]:
# Embed only the first 1000 Mirai payloads: running the encoder over the whole
# capture takes too long.
# .copy() makes the subset an independent frame, so adding a column below does
# not trigger pandas' SettingWithCopyWarning and can never touch mirai_df.
mirai_df_subset = mirai_df.head(1000).copy()

# Each payload is embedded on its own (a one-element list per call), so every
# cell holds the array returned for that single payload.
mirai_df_subset["payload_embedding_tensorflow"] = mirai_df_subset["Payload"].apply(
    lambda x: generate_embeddings([x])
)

mirai_df_subset