In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import ipaddress
import random

from gensim.models import Word2Vec
 
from read_pcaps import pcap_to_dataframe

# Load data
There are two ways to load the data:

- Directly reading a `pcap` and converting it to a Pandas `DataFrame`,
- Preloading the dataframe to a `.pkl` file. For more information on pickle files check the [article from RealPython about pickle module](https://realpython.com/python-pickle-module/).

In [2]:
mirai_pcap = "../data/mirai.pcap"
mirai_pkl = "../data/mirai.pkl"

In [3]:
# first time you run this, you should create your own pkl
# for security reasons we do not recommend to use untrusted pkl files
READ_FROM_PKL = True

In [4]:
if READ_FROM_PKL:
    mirai_df = pd.read_pickle(mirai_pkl)
else:
    mirai_df = pcap_to_dataframe(mirai_pcap)
    # save to pkl since dataframe conversion takes a long time
    mirai_df.to_pickle(mirai_pkl)

In [5]:
mirai_df.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
137215,1540450942.526571,,,,,ARP who has 192.168.2.9 says 192.168.2.110 / P...,60,
729946,1540453381.798203,,,,,ARP who has 192.168.2.12 says 192.168.2.110 / ...,60,
318690,1540451716.867595,,,,,ARP who has 192.168.2.28 says 192.168.2.110 / ...,60,
613589,1540452911.635111,,,,,ARP who has 192.168.2.158 says 192.168.2.110 /...,60,
673346,1540453153.52088,,,,,ARP who has 192.168.2.92 says 192.168.2.110 / ...,60,
324323,1540451739.755524,,,,,ARP who has 192.168.2.17 says 192.168.2.110 / ...,60,
467059,1540452317.719478,,,,,ARP who has 192.168.2.73 says 192.168.2.110 / ...,60,
720418,1540453342.700147,,,,,ARP who has 192.168.2.234 says 192.168.2.110 /...,60,
405457,1540452068.578377,,,,,ARP who has 192.168.2.228 says 192.168.2.110 /...,60,
559828,1540452693.718058,,,,,ARP who has 192.168.2.125 says 192.168.2.110 /...,60,


In [6]:
mirai_df.shape

(764137, 8)

In [7]:
mirai_df_no_arp = mirai_df.dropna(subset=["Source IP", "Destination IP", "Source Port", "Destination Port"])

In [8]:
mirai_df_no_arp.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
122499,1540450867.575965,192.168.2.108,52.25.66.250,41257.0,8280.0,Padding,26,6.0
44905,1540448030.63029,192.168.2.120,192.168.2.1,37045.0,53.0,"DNS Qry ""b'1.asia.pool.ntp.org.'""",45,17.0
218951,1540451313.831301,192.168.2.112,74.125.31.99,52352.0,80.0,,40,6.0
64284,1540448734.895802,192.168.2.113,114.215.137.159,50861.0,10240.0,Raw,48,17.0
206326,1540451262.871427,192.168.2.108,52.24.43.67,15894.0,80.0,Padding,26,6.0
423605,1540452141.512251,192.168.2.118,192.168.2.110,2323.0,21897.0,,20,6.0
674412,1540453157.600605,192.168.2.115,192.168.2.1,3295.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.'""",50,17.0
464751,1540452308.43506,192.168.2.1,192.168.2.110,23.0,21897.0,Padding,26,6.0
308335,1540451675.281268,192.168.2.113,114.215.137.159,36517.0,443.0,,40,6.0
469554,1540452327.798657,192.168.2.108,50.19.254.134,32761.0,10240.0,Raw,48,17.0


In [9]:
mirai_df_no_arp.shape

(154090, 8)

# Numerical features
Post processing numbers to ... better numbers that describe context or condense our data information.

## Cumulative

Summarize your numerical features and give them a new meaning and utility.

In [10]:
mirai_features_df = mirai_df_no_arp
mirai_features_df["src_ip_total_bytes"] = mirai_df_no_arp.groupby("Source IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mirai_features_df["src_ip_total_bytes"] = mirai_df_no_arp.groupby("Source IP")[


In [11]:
mirai_features_df["dst_ip_total_bytes"] = mirai_df_no_arp.groupby("Destination IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mirai_features_df["dst_ip_total_bytes"] = mirai_df_no_arp.groupby("Destination IP")[


In [12]:
mirai_features_df.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes
77276,1540449202.969292,192.168.2.115,192.168.2.1,4389.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.'""",50,17.0,349257,542456
64057,1540448726.298199,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0,724126,209608
94646,1540449844.798756,192.168.2.108,122.248.234.207,32761.0,10240.0,Raw,48,17.0,1069142,281472
49641,1540448206.352372,192.168.2.113,114.215.137.159,50861.0,10240.0,Raw,48,17.0,403889,45312
491598,1540452417.302534,192.168.2.108,50.19.254.134,32761.0,10240.0,Raw,48,17.0,1864914,521576
26624,1540447352.279599,192.168.2.112,74.125.31.99,49643.0,80.0,,40,6.0,50638,8480
106753,1540450295.417292,192.168.2.113,61.188.37.216,50861.0,10240.0,Raw,48,17.0,861818,348504
6952,1540446623.706705,192.168.2.113,120.24.59.150,48232.0,80.0,,40,6.0,56919,6280
213404,1540451291.176685,192.168.2.110,192.168.2.103,21897.0,23.0,Padding,26,6.0,247916,12464
570352,1540452736.438973,192.168.2.110,192.168.2.118,21897.0,23.0,Padding,26,6.0,443445,31998


## Numerical conversions

Convert numerical features to usable numbers.

In [13]:
def ip_to_numeric(ip):
    try:
        ip_obj = ipaddress.ip_interface(ip)
        ip = int(ip_obj.network.network_address)
    except ValueError:
        ip = 0

    return ip

In [14]:
def generate_random_ips(num_ips):
    ips = []

    # Define a list of public IP ranges and private IP ranges
    public_ranges = [
        ipaddress.IPv4Network("8.8.8.0/24"),
        ipaddress.IPv4Network("203.0.113.0/24"),
        ipaddress.IPv4Network("42.42.0.0/16"),
    ]
    private_ranges = [
        ipaddress.IPv4Network("192.168.1.0/24"),
        ipaddress.IPv4Network("10.0.0.0/8"),
        ipaddress.IPv4Network("172.16.0.0/12"),
    ]

    # Add at least two IP addresses from the same private subnet
    private_subnet = random.choice(private_ranges)
    ips.append(str(private_subnet.network_address))
    ips.append(str(private_subnet.network_address + 1))

    # Generate the remaining random IP addresses
    for _ in range(num_ips - 2):
        if random.choice([True, False]):
            # Choose a random public IP
            ip = str(
                random.choice(public_ranges).network_address + random.randint(0, 255)
            )
        else:
            # Choose a random private IP
            ip = str(
                random.choice(private_ranges).network_address + random.randint(0, 255)
            )

        ips.append(ip)

    return ips


# Generate a list of 100 random IPs
random_ips = generate_random_ips(10)

In [15]:
for ip in random_ips:
    print(f"{ip} -> {ip_to_numeric(ip)}")

172.16.0.0 -> 2886729728
172.16.0.1 -> 2886729729
42.42.0.25 -> 707395609
8.8.8.111 -> 134744175
10.0.0.55 -> 167772215
8.8.8.174 -> 134744238
172.16.0.237 -> 2886729965
10.0.0.70 -> 167772230
172.16.0.125 -> 2886729853
192.168.1.2 -> 3232235778


In [16]:
mirai_features_df["Numeric Source IP"] = mirai_features_df["Source IP"].apply(
    ip_to_numeric
)

mirai_features_df["Numeric Destination IP"] = mirai_features_df["Destination IP"].apply(
    ip_to_numeric
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mirai_features_df["Numeric Source IP"] = mirai_features_df["Source IP"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mirai_features_df["Numeric Destination IP"] = mirai_features_df["Destination IP"].apply(


In [17]:
# remove non-numeric IPs
mirai_features_df.pop("Source IP")
mirai_features_df.pop("Destination IP")

0           52.24.43.67
1          52.25.66.250
7           192.168.2.1
8         192.168.2.115
9           52.24.43.67
              ...      
764121    61.188.37.216
764122          8.8.8.8
764124      52.24.43.67
764125     52.25.66.250
764136      192.168.2.1
Name: Destination IP, Length: 154090, dtype: object

# Categorical features
What about the text data? We can convert those to numbers too.

## One hot encoding
Binary encoding that creates a vector with 0s and 1s that correspond to specific categories. If your data had the category populated, mark it as 1 otherwise mark as 0. 

In [18]:
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    135: "SCTP",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [19]:
def one_hot_port(port, df):
    new_df = pd.DataFrame()
    for protocol_port, protocol_name in network_protocols.items():
        new_df[protocol_name] = df[port].apply(
            lambda port: 1 if port == protocol_port else 0
        )
    return new_df

In [20]:
mirai_protocol_one_hot = one_hot_port("Destination Port", mirai_df_no_arp)
mirai_features_df = pd.concat([mirai_features_df, mirai_protocol_one_hot], axis=1)

## Frequency encoding
Counts the population that corresponds to a specific category. The result is still a vector of categories, however not with 0s ans 1s, but with real numbers that indicate how often the category is encountered in the data.

In [21]:
frequency_encoding = (
   mirai_df_no_arp["Destination Port"].value_counts(normalize=True).to_dict()
)

In [22]:
frequency_encoding

{10240.0: 0.40615224868583294,
 53.0: 0.21641897592316178,
 80.0: 0.11450451035109352,
 23.0: 0.06143163086507885,
 21897.0: 0.046453371406320984,
 8280.0: 0.04079434096956324,
 443.0: 0.019923421377117268,
 8000.0: 0.019274449996755143,
 21047.0: 0.010643130637938867,
 8080.0: 0.01025374780972159,
 1900.0: 0.0068271789214095656,
 32100.0: 0.004536309948731261,
 50364.0: 0.0024076838211434877,
 2323.0: 0.002018300992926212,
 57206.0: 0.0018430787202284378,
 51009.0: 0.0015575313128691024,
 123.0: 0.0012330456226880395,
 46734.0: 0.000947498215328704,
 68.0: 0.0008826010772924914,
 52777.0: 0.000623012525147641,
 41313.0: 0.0002920371211629567,
 9000.0: 0.0002660782659484717,
 67.0: 0.00017522227269777403,
 5353.0: 0.0001622428450905315,
 8629.0: 0.000149263417483289,
 138.0: 0.00011681484846518268,
 137.0: 9.73457070543189e-05,
 5355.0: 5.1917710428970085e-05,
 41291.0: 4.542799662534882e-05,
 41282.0: 3.8938282821727564e-05,
 41274.0: 3.8938282821727564e-05,
 3018.0: 1.946914141086378

In [23]:
mirai_features_df["dst_port_freq_encoded"] = mirai_df_no_arp["Destination Port"].map(
    frequency_encoding
)

# Embeddings

The magic behind Natural Language Processing (NLP) models. One-hot is good for categories, but it does not encapsulate the meaning of words. Embeddings map words in a multi-dimensional space and they put the words with similar meanings closer to each other.

In [24]:
# Tokenize the payload column to individual words
mirai_tokenized_payloads = mirai_df_no_arp["Payload"].apply(lambda x: x.lower().split())

mirai_tokenized_payloads

0                                                 [padding]
1                                                 [padding]
7         [dns, qry, "b'xmpp.samsungsmartcam.com.speedpo...
8                                                [dns, ans]
9                                                 [padding]
                                ...                        
764121                                                [raw]
764122                   [dns, qry, "b'time.windows.com.'"]
764124                                            [padding]
764125                                            [padding]
764136    [dns, qry, "b'north-america.pool.ntp.org.speed...
Name: Payload, Length: 154090, dtype: object

In [25]:
# Train Word2Vec model on tokenized payloads
mirai_word2vec_model = Word2Vec(
    sentences=mirai_tokenized_payloads,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [26]:
# Function to calculate the average word embedding for a sentence
def average_word_embedding(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    # fix size of embeddings if it is variable
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [27]:
# Apply the average_word_embedding function to create a new column 'payload_embedding'
mirai_features_df["payload_embedding"] = mirai_tokenized_payloads.apply(
    lambda x: average_word_embedding(x, mirai_word2vec_model)
)

# Display the resulting DataFrame
mirai_features_df

Unnamed: 0,Timestamp,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes,Numeric Source IP,Numeric Destination IP,...,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy,dst_port_freq_encoded,payload_embedding
0,1540446382.933899,21074.0,80.0,Padding,26,6.0,26,26,3232236140,873999171,...,0,0,0,0,0,0,0,0,0.114505,"[-0.100908704, 0.017527357, -0.0027728875, 0.0..."
1,1540446382.933904,20532.0,8280.0,Padding,26,6.0,52,26,3232236140,874070778,...,0,0,0,0,0,0,0,0,0.040794,"[-0.100908704, 0.017527357, -0.0027728875, 0.0..."
7,1540446383.391651,2440.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.Speedport_...",83,17.0,83,83,3232236147,3232236033,...,0,0,0,0,0,0,0,0,0.216419,"[-0.16273583, 0.14517587, -0.011447725, 0.0685..."
8,1540446383.393709,53.0,2440.0,DNS Ans,83,17.0,83,83,3232236033,3232236147,...,0,0,0,0,0,0,0,0,0.000013,"[-0.14730054, 0.13219145, 0.0085404515, 0.1037..."
9,1540446383.435821,21074.0,80.0,Padding,26,6.0,78,52,3232236140,873999171,...,0,0,0,0,0,0,0,0,0.114505,"[-0.100908704, 0.017527357, -0.0027728875, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764121,1540453519.302461,32761.0,10240.0,Raw,48,17.0,2207242,633840,3232236140,1035740632,...,0,0,0,0,0,0,0,0,0.406152,"[-0.07581303, 0.0123783, 0.0064401, 0.0413997,..."
764122,1540453519.455618,35284.0,53.0,"DNS Qry ""b'time.windows.com.'""",42,17.0,241142,158024,3232236141,134744072,...,0,0,0,0,0,0,0,0,0.216419,"[-0.15819925, 0.13991414, -0.017735327, 0.0477..."
764124,1540453519.756754,47157.0,80.0,Padding,26,6.0,2207268,163490,3232236140,873999171,...,0,0,0,0,0,0,0,0,0.114505,"[-0.100908704, 0.017527357, -0.0027728875, 0.0..."
764125,1540453519.756832,46453.0,8280.0,Padding,26,6.0,2207294,163436,3232236140,874070778,...,0,0,0,0,0,0,0,0,0.040794,"[-0.100908704, 0.017527357, -0.0027728875, 0.0..."


In [28]:
# eliminate payload and keep only numerical features
mirai_features_df.pop("Payload")

0                                                   Padding
1                                                   Padding
7         DNS Qry "b'xmpp.samsungsmartcam.com.Speedport_...
8                                                  DNS Ans 
9                                                   Padding
                                ...                        
764121                                                  Raw
764122                      DNS Qry "b'time.windows.com.'" 
764124                                              Padding
764125                                              Padding
764136    DNS Qry "b'north-america.pool.ntp.org.Speedpor...
Name: Payload, Length: 154090, dtype: object