In [1]:
import pandas as pd
import numpy as np

from read_pcaps import pcap_to_dataframe

In [2]:
# Switch between the two data paths used by later cells:
#   True  -> load the previously saved .pkl DataFrames,
#   False -> re-parse the raw pcaps, convert them and re-save the pickles.
READ_FROM_PKL = True

In [3]:
# Load the pickled Mirai and benign packet DataFrames.
# NOTE: unpickling can execute arbitrary code, so only load .pkl files that
# come from a trusted source.
mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")

# Protocol Label Encoding

In [4]:
# Protocol names searched for inside packet payload text. Order matters:
# extract_protocol returns the first entry that matches.
protocols = [
    "ARP",
    "HTTP",
    "HTTPS",
    "FTP",
    "FTPS",
    "SMTP",
    "POP3",
    "IMAP",
    "Telnet",
    "DNS",
    "DHCP",
    "SNMP",
    "NTP",
    "SSH",
    "SMB",
    "LDAP",
    "SIP",
    "ICMP",
]

In [5]:
def extract_protocol(payload, known_protocols=None):
    """Return the first known protocol name that occurs in ``payload``.

    Candidates are tried in list order, so an earlier entry wins when several
    unrelated names occur in the same payload. When the winning name is itself
    contained in a longer candidate that also occurs in the payload (for
    example "HTTP" inside "HTTPS"), the longest such candidate is returned so
    that the shorter name does not shadow it.

    Parameters
    ----------
    payload : str
        Text to search. Any non-string value (e.g. None or NaN) yields None
        instead of raising.
    known_protocols : iterable of str, optional
        Candidate names in priority order. Defaults to the notebook-level
        ``protocols`` list.

    Returns
    -------
    str or None
        The matched protocol name, or None when nothing matches.
    """
    if not isinstance(payload, str):
        return None

    candidates = protocols if known_protocols is None else list(known_protocols)
    for protocol in candidates:
        if protocol in payload:
            # `protocol` is always among the options below, so max() never
            # sees an empty sequence.
            return max(
                (
                    other
                    for other in candidates
                    if protocol in other and other in payload
                ),
                key=len,
            )
    return None

In [6]:
# Tag every packet in both captures with the protocol name found in its
# payload text (rows where extract_protocol finds nothing get None).
benign_df["payload_proto"] = benign_df["Payload"].map(extract_protocol)
mirai_df["payload_proto"] = mirai_df["Payload"].map(extract_protocol)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit ONE encoder on the protocol labels of both captures so that a given
# protocol maps to the same integer in mirai_df and in benign_df. Fitting a
# separate encoder per frame derives the codes from each frame's own label
# set, which makes the two "protocol_encoded" columns incomparable.
protocol_encoder = LabelEncoder().fit(
    pd.concat(
        [mirai_df["payload_proto"], benign_df["payload_proto"]],
        ignore_index=True,
    )
)

mirai_df["protocol_encoded"] = protocol_encoder.transform(mirai_df["payload_proto"])
benign_df["protocol_encoded"] = protocol_encoder.transform(benign_df["payload_proto"])

In [8]:
# Spot-check the last rows, including the new payload_proto and
# protocol_encoded columns.
mirai_df.tail()

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,payload_proto,protocol_encoded
764132,1540453519.837515,,,,,ARP who has 192.168.2.165 says 192.168.2.110 /...,60,,ARP,0
764133,1540453519.839396,,,,,ARP who has 192.168.2.166 says 192.168.2.110 /...,60,,ARP,0
764134,1540453519.840611,,,,,ARP who has 192.168.2.167 says 192.168.2.110 /...,60,,ARP,0
764135,1540453519.842369,,,,,ARP who has 192.168.2.168 says 192.168.2.110 /...,60,,ARP,0
764136,1540453519.842464,192.168.2.115,192.168.2.1,3570.0,53.0,"DNS Qry ""b'north-america.pool.ntp.org.Speedpor...",85,17.0,DNS,1


# Service one-hot encoding

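Read two more pcaps containing malware traffic (a GuLoader/ModiLoader-style Remcos RAT infection and a TA577 Pikabot infection with Cobalt Strike) and extract features from them.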

In [9]:
# Parse the two raw malware captures from disk; skipped when the cached
# pickles are used instead.
# NOTE(review): `rdpcap` is not imported in any cell visible in this notebook
# (presumably scapy's `rdpcap`). Unless it is brought into scope elsewhere,
# this cell raises NameError when READ_FROM_PKL is False -- confirm.
if not READ_FROM_PKL:
    guloader = rdpcap("../data/blog_fe/2023-06-26-guloader-or-modiloader-style-infection-for-Remcos-RAT.pcap")
    picabot = rdpcap("../data/blog_fe/2023-12-18-TA577-Pikabot-infection-with-Cobalt-Strike.pcap")

In [10]:
# Convert the parsed captures into DataFrames with the project helper
# pcap_to_dataframe. Relies on `guloader` / `picabot` from the previous cell,
# so it is guarded by the same READ_FROM_PKL flag.
if not READ_FROM_PKL:
    guloader_df = pcap_to_dataframe(guloader)
    picabot_df = pcap_to_dataframe(picabot)

In [11]:
# Cache the freshly converted DataFrames as .pkl files because the
# pcap-to-DataFrame conversion takes a long time; later runs can then set
# READ_FROM_PKL = True and load these files instead.
if not READ_FROM_PKL:
    guloader_df.to_pickle("../data/blog_fe/guloader.pkl")
    picabot_df.to_pickle("../data/blog_fe/picabot.pkl")

In [12]:
# Lookup table {number: label} that one_hot_port compares against a port
# column. Every label must be unique: one_hot_port builds one column per
# label, so a repeated label makes two keys collapse into a single column.
#
# NOTE(review): several keys (1, 6, 17, 41, 47, 50, 51, 58, 89, 132, 136)
# look like IP protocol numbers rather than TCP/UDP port numbers, while the
# notebook matches this table against "Destination Port" -- confirm intent.
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    # Was a second "SCTP" entry, which clashed with key 132. Port 135 is the
    # RPC endpoint mapper, so it gets its own label.
    135: "MSRPC",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [13]:
# When READ_FROM_PKL is enabled, load the cached guloader/picabot DataFrames
# instead of re-parsing the pcaps.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
if READ_FROM_PKL:
    guloader_df = pd.read_pickle("../data/blog_fe/guloader.pkl")
    picabot_df = pd.read_pickle("../data/blog_fe/picabot.pkl")

In [14]:
def one_hot_port(port, df, protocol_map=None):
    """One-hot encode a port column against a {number: label} table.

    Parameters
    ----------
    port : str
        Name of the column in ``df`` holding the port values.
    df : pandas.DataFrame
        Frame containing the ``port`` column.
    protocol_map : dict, optional
        Mapping of port number to column label. Defaults to the
        notebook-level ``network_protocols`` table.

    Returns
    -------
    pandas.DataFrame
        One 0/1 integer column per distinct label, indexed like ``df``. A row
        holds 1 where its port equals a number mapped to that label. Numbers
        that share a label are merged with OR, so a later entry no longer
        overwrites the hits of an earlier one. Missing ports compare unequal
        and therefore produce 0.
    """
    if protocol_map is None:
        protocol_map = network_protocols

    port_values = df[port]
    hits_by_label = {}
    for protocol_port, protocol_name in protocol_map.items():
        hits = port_values.eq(protocol_port)
        if protocol_name in hits_by_label:
            # Same label seen before: keep earlier matches as well.
            hits = hits | hits_by_label[protocol_name]
        hits_by_label[protocol_name] = hits

    return pd.DataFrame(
        {label: hits.astype("int64") for label, hits in hits_by_label.items()},
        index=df.index,
    )

In [15]:
# One-hot encode the destination ports of the guloader capture.
guloader_protocol_one_hot = one_hot_port("Destination Port", guloader_df)

In [16]:
# Display the encoded frame (the rendered table is truncated by pandas).
guloader_protocol_one_hot

Unnamed: 0,ICMP,TCP,UDP,Telnet,IPv6_encapsulation,GRE,ESP,AH,DNS,ICMPv6,...,IMAP,SNMP,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7155,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7157,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# One-hot encode the destination ports of the picabot capture.
picabot_protocol_one_hot = one_hot_port("Destination Port", picabot_df)

In [18]:
# Display the encoded frame (the rendered table is truncated by pandas).
picabot_protocol_one_hot

Unnamed: 0,ICMP,TCP,UDP,Telnet,IPv6_encapsulation,GRE,ESP,AH,DNS,ICMPv6,...,IMAP,SNMP,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Packet length ordinal encoding

In [19]:
def length_ordinal_mapping(length, low, medium, high):
    """Bucket a packet length into an ordinal label.

    Parameters
    ----------
    length : number
        Packet length to classify.
    low : number
        Inclusive upper bound of the "LOW" bucket.
    medium : number
        Inclusive upper bound of the "MEDIUM" bucket.
    high : number
        Accepted for signature compatibility but never consulted; every
        value that is not LOW or MEDIUM is labelled "HIGH".

    Returns
    -------
    str
        "LOW" when ``length <= low``, "MEDIUM" when ``low < length <= medium``,
        otherwise "HIGH".
    """
    fits_low = length <= low
    if fits_low:
        return "LOW"

    fits_medium = low < length <= medium
    return "MEDIUM" if fits_medium else "HIGH"

In [20]:
# Bucket every packet length into LOW / MEDIUM / HIGH (thresholds 60, 256,
# 1024) and store the label in a "Risk" column on each capture.
guloader_df["Risk"] = guloader_df["Packet Length"].apply(
    length_ordinal_mapping, args=(60, 256, 1024)
)
picabot_df["Risk"] = picabot_df["Packet Length"].apply(
    length_ordinal_mapping, args=(60, 256, 1024)
)

In [21]:
# Inspect the guloader frame, now including the "Risk" column.
guloader_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Risk
0,1687822487.869545,10.6.26.101,10.6.26.1,52310,53,"DNS Qry ""b'acrobat.adobe.com.'""",43,17,LOW
1,1687822487.895684,10.6.26.1,10.6.26.101,53,52310,"DNS Ans ""b'acrobat.adobe.com.i.edgekey.net.'""",157,17,MEDIUM
2,1687822487.981506,10.6.26.101,23.198.7.187,49700,443,,32,6,LOW
3,1687822488.007546,23.198.7.187,10.6.26.101,443,49700,,24,6,LOW
4,1687822488.007728,10.6.26.101,23.198.7.187,49700,443,,20,6,LOW
...,...,...,...,...,...,...,...,...,...
7155,1687825049.166782,10.6.26.101,194.187.251.91,49795,12603,Raw,91,6,MEDIUM
7156,1687825049.166982,194.187.251.91,10.6.26.101,12603,49795,,20,6,LOW
7157,1687825079.168643,194.187.251.91,10.6.26.101,12603,49795,Raw,62,6,MEDIUM
7158,1687825079.170813,10.6.26.101,194.187.251.91,49795,12603,Raw,81,6,MEDIUM


# Port frequency encoding

# Payload Embeddings