In [1]:
import pandas as pd
import numpy as np

from scapy.all import PcapReader
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer

from read_pcaps import pcap_to_dataframe

In [2]:
# When True, the notebook loads previously pickled DataFrames instead of
# re-parsing the raw pcap captures; set to False to rebuild them from the pcaps.
READ_FROM_PKL = True

In [3]:
# Load the Mirai and benign packet captures as DataFrames.
if READ_FROM_PKL:
    # NOTE: only unpickle files from a trusted source -- unpickling can run arbitrary code.
    mirai_df = pd.read_pickle("../data/blog_eda/mirai.pkl")
    benign_df = pd.read_pickle("../data/blog_eda/benign.pkl")
else:
    # Open each capture in a context manager so the underlying file handle is
    # closed as soon as the conversion is done (the readers were previously
    # left open for the lifetime of the kernel).
    # Assumes pcap_to_dataframe consumes the reader fully before returning.
    with PcapReader("../data/blog_eda/mirai.pcap") as pcap_reader_mirai:
        mirai_df = pcap_to_dataframe(pcap_reader_mirai)
    with PcapReader("../data/blog_eda/benign.pcapng") as pcap_reader_benign:
        benign_df = pcap_to_dataframe(pcap_reader_benign)

In [None]:
# Load the previously saved feature tables; later cells append new columns to them
mirai_features_df, benign_features_df = map(
    pd.read_pickle,
    ("../data/blog_fe/mirai_features.pkl", "../data/blog_fe/benign_features.pkl"),
)

In [4]:
# Discard every packet row that contains at least one missing value
mirai_df, benign_df = (packets.dropna() for packets in (mirai_df, benign_df))

In [5]:
# Spot-check 10 random values of the "Protocol" column (no random_state is set, so the rows differ per run)
mirai_df["Protocol"].sample(n=10)

752891    17.0
687375    17.0
14876      6.0
115427    17.0
71349     17.0
508098    17.0
9061       6.0
71761     17.0
94845     17.0
52081     17.0
Name: Protocol, dtype: float64

# Protocol Label Encoding

In [6]:
# Protocol names that extract_protocol searches for as substrings of a packet's
# payload text. The list order is the order in which the names are tried.
protocols = [
    "ARP",
    "HTTP",
    "HTTPS",
    "FTP",
    "FTPS",
    "SMTP",
    "POP3",
    "IMAP",
    "Telnet",
    "DNS",
    "DHCP",
    "SNMP",
    "NTP",
    "SSH",
    "SMB",
    "LDAP",
    "SIP",
    "ICMP",
]

In [7]:
def extract_protocol(payload, known_protocols=None):
    """Return the name of the first known protocol mentioned in ``payload``.

    Candidates are tried in list order and matched as case-sensitive
    substrings. When the matching name is itself contained in a longer
    candidate that also occurs in the payload (e.g. "HTTP" inside "HTTPS"),
    the longer, more specific name is returned; otherwise the shorter entry
    listed first would always shadow it.

    Args:
        payload: Payload text of one packet. Non-string values (None, NaN)
            are treated as "no protocol found".
        known_protocols: Optional iterable of protocol names to look for.
            Defaults to the module-level ``protocols`` list.

    Returns:
        The matching protocol name, or None when nothing matches.
    """
    if known_protocols is None:
        known_protocols = protocols
    if not isinstance(payload, str):
        return None

    candidates = list(known_protocols)
    for protocol in candidates:
        if protocol not in payload:
            continue
        # Prefer a longer candidate that extends this one and is also present.
        more_specific = [
            other
            for other in candidates
            if other != protocol and protocol in other and other in payload
        ]
        if more_specific:
            return max(more_specific, key=len)
        return protocol
    return None

In [8]:
# Derive a "payload_proto" column by running extract_protocol over every payload
for packets in (benign_df, mirai_df):
    packets["payload_proto"] = packets["Payload"].apply(extract_protocol)

In [9]:
# Label-encode the payload protocol with ONE encoder fitted on both captures.
# Fitting a separate LabelEncoder per frame assigns integer codes independently,
# so the same protocol could receive different codes in the Mirai and benign
# frames, making the encoded column incomparable between them.
payload_proto_encoder = LabelEncoder().fit(
    pd.concat(
        [mirai_df["payload_proto"], benign_df["payload_proto"]],
        ignore_index=True,
    )
)
mirai_df["payload_proto_encoded"] = payload_proto_encoder.transform(mirai_df["payload_proto"])
benign_df["payload_proto_encoded"] = payload_proto_encoder.transform(benign_df["payload_proto"])

In [10]:
# Spot-check 10 random rows, including the new payload_proto columns (unseeded, so rows differ per run)
mirai_df.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,payload_proto,payload_proto_encoded
534344,1540452590.917684,192.168.2.1,192.168.2.115,53.0,2904.0,DNS Ans,83,17.0,DNS,0
21897,1540447176.800026,192.168.2.108,50.19.254.134,32761.0,10240.0,Raw,48,17.0,,2
427513,1540452157.182646,192.168.2.113,122.248.234.207,43480.0,8080.0,,40,6.0,,2
15083,1540446920.377369,192.168.2.112,74.125.31.99,49346.0,80.0,,40,6.0,,2
42704,1540447946.984856,192.168.2.113,210.61.248.232,34583.0,80.0,,40,6.0,,2
94967,1540449858.062473,192.168.2.108,52.24.43.67,48197.0,80.0,Padding,26,6.0,,2
115318,1540450610.042468,192.168.2.108,52.24.43.67,8257.0,80.0,Padding,26,6.0,,2
662539,1540453109.412688,192.168.2.113,114.215.137.159,50861.0,10240.0,Raw,48,17.0,,2
30872,1540447508.86563,192.168.2.126,192.168.2.1,3126.0,53.0,"DNS Qry ""b'www.baidu.com.'""",39,17.0,DNS,0
21119,1540447146.800747,192.168.2.108,46.137.188.54,32761.0,10240.0,Raw,48,17.0,,2


In [12]:
# Attach the encoded payload protocol to the feature tables.
# A column of the same name that is already present is dropped first: the
# notebook saves the features back to the pickle it loaded them from, so
# without this every re-run would append another duplicate column.
mirai_features_df = pd.concat(
    [
        mirai_features_df.drop(columns=["payload_proto_encoded"], errors="ignore"),
        mirai_df["payload_proto_encoded"],
    ],
    axis=1,
)
benign_features_df = pd.concat(
    [
        benign_features_df.drop(columns=["payload_proto_encoded"], errors="ignore"),
        benign_df["payload_proto_encoded"],
    ],
    axis=1,
)

# Service one-hot encoding

Read two more pcaps containing malware traffic (a GuLoader/ModiLoader-style infection and a Pikabot infection) to get more interesting data.

In [13]:
# Parse the two malware captures into in-memory packet lists.
# `rdpcap` is not imported in this notebook, so calling it raised NameError;
# the already imported PcapReader is used instead. `read_all()` loads every
# packet, and the context manager closes the capture file afterwards.
if not READ_FROM_PKL:
    with PcapReader(
        "../data/blog_fe/2023-06-26-guloader-or-modiloader-style-infection-for-Remcos-RAT.pcap"
    ) as guloader_reader:
        guloader = guloader_reader.read_all()
    with PcapReader(
        "../data/blog_fe/2023-12-18-TA577-Pikabot-infection-with-Cobalt-Strike.pcap"
    ) as picabot_reader:
        picabot = picabot_reader.read_all()

In [14]:
if not READ_FROM_PKL:
    # Convert both parsed captures into DataFrames
    guloader_df, picabot_df = (
        pcap_to_dataframe(capture) for capture in (guloader, picabot)
    )

In [15]:
# Cache the converted DataFrames on disk, because the pcap -> DataFrame conversion is slow
if not READ_FROM_PKL:
    for converted_df, pickle_path in (
        (guloader_df, "../data/blog_fe/guloader.pkl"),
        (picabot_df, "../data/blog_fe/picabot.pkl"),
    ):
        converted_df.to_pickle(pickle_path)

In [16]:
# Number -> label table used by one_hot_port; each label becomes one column.
# Labels must be unique: two keys sharing a label end up in the same column
# name, and the later one silently overwrites the earlier one. 135 used to
# repeat the "SCTP" label of 132; it is IP protocol number 135 (Mobility
# Header) in the IANA registry.
# NOTE(review): the keys mix IP protocol numbers (e.g. 6 TCP, 17 UDP) with
# well-known port numbers (e.g. 443 HTTPS), while the table is matched against
# the "Destination Port" column -- confirm this is intended.
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    135: "Mobility_Header",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [17]:
# With READ_FROM_PKL enabled, restore the cached malware DataFrames from disk
if READ_FROM_PKL:
    guloader_df, picabot_df = map(
        pd.read_pickle,
        ("../data/blog_fe/guloader.pkl", "../data/blog_fe/picabot.pkl"),
    )

In [18]:
def one_hot_port(port, df, protocol_map=None):
    """One-hot encode a port column against a number -> label table.

    For every ``(number, label)`` pair a 0/1 column named ``label`` is
    produced, holding 1 where ``df[port]`` equals ``number``. The comparison
    is vectorized instead of calling a Python lambda per row. If several
    numbers share a label, their indicators are OR-ed into one column rather
    than the last one overwriting the others.

    Args:
        port: Name of the column in ``df`` that holds the port numbers.
        df: DataFrame containing that column.
        protocol_map: Optional ``{number: label}`` dict. Defaults to the
            module-level ``network_protocols`` table.

    Returns:
        A new DataFrame indexed like ``df`` with one int64 column per label.
    """
    if protocol_map is None:
        protocol_map = network_protocols

    port_values = df[port]
    indicator_columns = {}
    for protocol_port, protocol_name in protocol_map.items():
        indicator = (port_values == protocol_port).astype("int64")
        if protocol_name in indicator_columns:
            # Same label seen before: keep rows matched by either number.
            indicator = indicator_columns[protocol_name] | indicator
        indicator_columns[protocol_name] = indicator
    return pd.DataFrame(indicator_columns)

In [19]:
# One-hot encode the "Destination Port" column of the GuLoader capture
guloader_protocol_one_hot = one_hot_port("Destination Port", guloader_df)

In [20]:
# Preview the first rows of the GuLoader one-hot table
guloader_protocol_one_hot.head()

Unnamed: 0,ICMP,TCP,UDP,Telnet,IPv6_encapsulation,GRE,ESP,AH,DNS,ICMPv6,...,IMAP,SNMP,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [21]:
# One-hot encode the "Destination Port" column of the Pikabot capture
picabot_protocol_one_hot = one_hot_port("Destination Port", picabot_df)

In [22]:
# Display the Pikabot one-hot table (the rendered output is truncated by pandas)
picabot_protocol_one_hot

Unnamed: 0,ICMP,TCP,UDP,Telnet,IPv6_encapsulation,GRE,ESP,AH,DNS,ICMPv6,...,IMAP,SNMP,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# One-hot encode the destination ports of the Mirai and benign captures
mirai_protocol_one_hot, benign_protocol_one_hot = (
    one_hot_port("Destination Port", packets) for packets in (mirai_df, benign_df)
)

In [24]:
# Append the one-hot port columns to the feature tables.
# Columns with the same names that are already present are dropped first: the
# notebook saves the features back to the pickle it loaded them from, so
# without this every re-run would append another set of duplicate columns.
mirai_features_df = pd.concat(
    [
        mirai_features_df.drop(columns=mirai_protocol_one_hot.columns, errors="ignore"),
        mirai_protocol_one_hot,
    ],
    axis=1,
)
benign_features_df = pd.concat(
    [
        benign_features_df.drop(columns=benign_protocol_one_hot.columns, errors="ignore"),
        benign_protocol_one_hot,
    ],
    axis=1,
)

# Packet length ordinal encoding

In [25]:
def length_ordinal_mapping(length, low, medium):
    """Bucket a packet length into "LOW", "MEDIUM" or "HIGH".

    Args:
        length: Value to classify.
        low: Inclusive upper bound of the "LOW" bucket.
        medium: Inclusive upper bound of the "MEDIUM" bucket.

    Returns:
        "LOW" when ``length <= low``, "MEDIUM" when ``low < length <= medium``,
        and "HIGH" for everything else.
    """
    if length <= low:
        return "LOW"
    in_medium_band = low < length <= medium
    return "MEDIUM" if in_medium_band else "HIGH"

In [26]:
# Bucket the packet lengths of both malware captures into a "Risk" column
# using the thresholds 60 (upper bound of LOW) and 256 (upper bound of MEDIUM)
for capture_df in (guloader_df, picabot_df):
    capture_df["Risk"] = capture_df["Packet Length"].apply(
        length_ordinal_mapping, args=(60, 256)
    )

In [27]:
# Inspect the Pikabot packets together with the new "Risk" column
picabot_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Risk
0,1702920834.113353,10.0.0.101,10.0.0.10,51024.0,53.0,"DNS Qry ""b'tsdandassociates.co.sz.'""",48,17.0,LOW
1,1702920834.848200,10.0.0.10,10.0.0.101,53.0,51024.0,"DNS Ans ""41.185.8.61""",64,17.0,MEDIUM
2,1702920834.883431,10.0.0.101,41.185.8.61,55051.0,80.0,,32,6.0,LOW
3,1702920835.135889,10.0.0.101,41.185.8.61,55052.0,80.0,,32,6.0,LOW
4,1702920835.164383,41.185.8.61,10.0.0.101,80.0,55051.0,,32,6.0,LOW
...,...,...,...,...,...,...,...,...,...
15350,1702926481.959381,10.0.0.101,95.179.247.197,55286.0,13782.0,Padding,26,6.0,LOW
15351,1702926481.960310,10.0.0.101,95.179.247.197,55286.0,13782.0,Raw,718,6.0,HIGH
15352,1702926482.169573,95.179.247.197,10.0.0.101,13782.0,55286.0,Padding,26,6.0,LOW
15353,1702926483.166017,95.179.247.197,10.0.0.101,13782.0,55286.0,Raw,444,6.0,HIGH


In [28]:
# Add the packet-length "Risk" bucket (thresholds 60 / 256) to both feature tables
for features_df, packets_df in (
    (mirai_features_df, mirai_df),
    (benign_features_df, benign_df),
):
    features_df["Risk"] = packets_df["Packet Length"].apply(
        length_ordinal_mapping, args=(60, 256)
    )

# Port frequency encoding

In [29]:
# Map every destination port of the GuLoader capture to its relative frequency
guloader_dst_ports = guloader_df["Destination Port"]
frequency_encoding = guloader_dst_ports.value_counts(normalize=True).to_dict()
guloader_df["dst_port_freq_encoded"] = guloader_dst_ports.map(frequency_encoding)

In [30]:
# Inspect the port -> relative frequency table built from the GuLoader capture
frequency_encoding

{49700: 0.6353351955307263,
 443: 0.12974860335195532,
 49782: 0.05907821229050279,
 49794: 0.05111731843575419,
 80: 0.03212290502793296,
 49792: 0.02988826815642458,
 49793: 0.02988826815642458,
 49795: 0.01759776536312849,
 12603: 0.009497206703910615,
 49780: 0.0016759776536312849,
 49781: 0.0016759776536312849,
 53: 0.0008379888268156424,
 49796: 0.0006983240223463687,
 52310: 0.00013966480446927373,
 51178: 0.00013966480446927373,
 56039: 0.00013966480446927373,
 55167: 0.00013966480446927373,
 59835: 0.00013966480446927373,
 57561: 0.00013966480446927373}

In [31]:
# Inspect the GuLoader packets together with the new "dst_port_freq_encoded" column
guloader_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,Risk,dst_port_freq_encoded
0,1687822487.869545,10.6.26.101,10.6.26.1,52310,53,"DNS Qry ""b'acrobat.adobe.com.'""",43,17,LOW,0.000838
1,1687822487.895684,10.6.26.1,10.6.26.101,53,52310,"DNS Ans ""b'acrobat.adobe.com.i.edgekey.net.'""",157,17,MEDIUM,0.000140
2,1687822487.981506,10.6.26.101,23.198.7.187,49700,443,,32,6,LOW,0.129749
3,1687822488.007546,23.198.7.187,10.6.26.101,443,49700,,24,6,LOW,0.635335
4,1687822488.007728,10.6.26.101,23.198.7.187,49700,443,,20,6,LOW,0.129749
...,...,...,...,...,...,...,...,...,...,...
7155,1687825049.166782,10.6.26.101,194.187.251.91,49795,12603,Raw,91,6,MEDIUM,0.009497
7156,1687825049.166982,194.187.251.91,10.6.26.101,12603,49795,,20,6,LOW,0.017598
7157,1687825079.168643,194.187.251.91,10.6.26.101,12603,49795,Raw,62,6,MEDIUM,0.017598
7158,1687825079.170813,10.6.26.101,194.187.251.91,49795,12603,Raw,81,6,MEDIUM,0.009497


In [32]:
# Frequency-encode the destination ports of the Mirai and benign captures.
# The frequency table is built from these two captures themselves. Looking the
# ports up in `frequency_encoding` (derived from the unrelated GuLoader capture)
# yields NaN for every port that never occurs in that capture. One shared table
# also gives a given port the same encoded value in both feature tables.
mirai_benign_port_freq = pd.concat(
    [mirai_df["Destination Port"], benign_df["Destination Port"]],
    ignore_index=True,
).value_counts(normalize=True)
mirai_features_df["dst_port_freq_encoded"] = mirai_df["Destination Port"].map(mirai_benign_port_freq)
benign_features_df["dst_port_freq_encoded"] = benign_df["Destination Port"].map(mirai_benign_port_freq)

# Bag of words

## Binary embedding

In [33]:
# Define a CountVectorizer with binary encoding: each term is recorded as
# present (1) or absent (0) in a payload rather than counted
vectorizer = CountVectorizer(binary=True, lowercase=True, stop_words=None)

# Fit the vocabulary on the Mirai payloads and transform them to a sparse matrix
X = vectorizer.fit_transform(mirai_df["Payload"])

# Convert the sparse matrix to a dense binary matrix.
# toarray() already returns a new dense ndarray; wrapping it in np.array()
# copied the whole (potentially very large) matrix a second time for no benefit.
binary_encoding = X.toarray()

In [34]:
# Show the binary bag-of-words vector of one payload (row 1000); each position
# corresponds to one unique n-gram (word) found in the payloads
print("Binary Encoding:", binary_encoding[1000], sep="\n")

Binary Encoding:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


# Save features dataframe

In [35]:
# Persist the extended feature tables (overwrites the pickles loaded earlier)
for features_df, pickle_path in (
    (mirai_features_df, "../data/blog_fe/mirai_features.pkl"),
    (benign_features_df, "../data/blog_fe/benign_features.pkl"),
):
    features_df.to_pickle(pickle_path)