In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ipaddress
import random
import spacy
 
from utils.pcap import pcap_to_dataframe, extract_streams

# Load data
There are two ways to load the data:

- Directly reading a `pcap` and converting it to a Pandas `DataFrame`,
- Preloading the dataframe to a `.pkl` file. For more information on pickle files check the [article from RealPython about pickle module](https://realpython.com/python-pickle-module/).

In [2]:
malicious_pcap = "./data/mirai.pcap"
benign_pcap = "./data/benign.pcapng"
malicious_pkl = "./data/mirai.pkl"
benign_pkl = "./data/benign.pkl"

In [3]:
# first time you run this, you should create your own pkl
# for security reasons we do not recommend to use untrusted pkl files
READ_FROM_PKL = True

In [4]:
if READ_FROM_PKL:
    malicious_df = pd.read_pickle(malicious_pkl)
    benign_df = pd.read_pickle(benign_pkl)
    
    malicious_stream_df = pd.read_pickle("data/mirai_stream.pkl")
    benign_stream_df = pd.read_pickle("data/benign_stream.pkl")
else:
    malicious_df = pcap_to_dataframe(malicious_pcap)
    benign_df = pcap_to_dataframe(benign_pcap)
    malicious_stream_df = extract_streams(malicious_df)
    benign_stream_df = extract_streams(benign_df)
    # save to pkl since dataframe conversion takes a long time
    malicious_df.to_pickle(malicious_pkl)
    benign_df.to_pickle(benign_pkl)

In [5]:
malicious_df.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
76818,1540449185.273706,192.168.2.113,122.226.84.253,50861.0,10240.0,Raw,48,17.0
15154,1540446923.557335,192.168.2.109,8.8.8.8,52306.0,53.0,"DNS Qry ""b'ntp.eye4.cn.'""",37,17.0
637501,1540453008.346641,,,,,ARP who has 192.168.2.93 says 192.168.2.110 / ...,60,
679725,1540453179.429209,,,,,ARP who has 192.168.2.19 says 192.168.2.110 / ...,60,
360150,1540451884.811579,,,,,ARP who has 192.168.2.134 says 192.168.2.110 /...,60,
631552,1540452984.472333,,,,,ARP who has 192.168.2.246 says 192.168.2.110 /...,60,
709991,1540453301.044527,,,,,ARP who has 192.168.2.95 says 192.168.2.110 / ...,60,
491856,1540452418.371961,,,,,ARP who has 192.168.2.74 says 192.168.2.110 / ...,60,
156895,1540451038.712409,,,,,ARP who has 192.168.2.56 says 192.168.2.110 / ...,60,
710351,1540453302.952058,,,,,ARP who has 192.168.2.46 says 192.168.2.110 / ...,60,


In [6]:
malicious_df.shape

(764137, 8)

In [7]:
# copy the dataframes to a features dataframe while omitting the packets with NaN src/dst ips/ports
malicious_features = malicious_df.dropna(subset=["Source IP", "Destination IP", "Source Port", "Destination Port"])
benign_features = benign_df.dropna(subset=["Source IP", "Destination IP", "Source Port", "Destination Port"])

In [8]:
malicious_features.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
85911,1540449523.801973,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0
433630,1540452182.147603,192.168.2.112,74.125.31.99,52964.0,80.0,,40,6.0
654544,1540453076.961463,192.168.2.118,192.168.2.110,23.0,21897.0,,20,6.0
56754,1540448467.9086,192.168.2.115,192.168.2.1,3876.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.'""",50,17.0
415205,1540452107.798843,192.168.2.108,46.137.188.54,32761.0,10240.0,Raw,48,17.0
466101,1540452313.632408,192.168.2.1,192.168.2.110,23.0,21897.0,Padding,26,6.0
515912,1540452516.057457,192.168.2.113,120.24.59.150,50861.0,10240.0,Raw,48,17.0
697675,1540453251.847634,192.168.2.109,8.8.8.8,48403.0,53.0,"DNS Qry ""b'time.windows.com.'""",42,17.0
307772,1540451672.553495,192.168.2.108,52.24.43.67,15245.0,80.0,Padding,26,6.0
473076,1540452342.051051,192.168.2.112,74.125.31.99,53072.0,80.0,,40,6.0


In [9]:
malicious_features.shape

(154090, 8)

# Numerical features
Post processing numbers to ... better numbers that describe context or condense our data information.

## Cumulative

Summarize your numerical features and give them a new meaning and utility.

In [10]:
malicious_features["src_ip_total_bytes"] = malicious_features.groupby("Source IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["src_ip_total_bytes"] = malicious_features.groupby("Source IP")[


In [11]:
benign_features["src_ip_total_bytes"] = benign_features.groupby("Source IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["src_ip_total_bytes"] = benign_features.groupby("Source IP")[


In [13]:
malicious_features["dst_ip_total_bytes"] = malicious_features.groupby("Destination IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["dst_ip_total_bytes"] = malicious_features.groupby("Destination IP")[


In [14]:
benign_features["dst_ip_total_bytes"] = benign_features.groupby("Destination IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["dst_ip_total_bytes"] = benign_features.groupby("Destination IP")[


In [15]:
malicious_features.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes
146710,1540450997.361384,192.168.2.110,192.168.2.103,50364.0,23.0,Padding,26,6.0,205118,3627
25459,1540447307.848149,192.168.2.108,52.24.43.67,36798.0,80.0,Padding,26,6.0,286698,21034
55448,1540448420.472489,192.168.2.113,120.24.59.150,50861.0,10240.0,Raw,48,17.0,451137,49544
711696,1540453308.162958,192.168.2.113,122.226.84.253,43423.0,443.0,,40,6.0,1522748,552328
90308,1540449684.796769,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0,1019914,294256
81781,1540449367.810698,192.168.2.108,122.226.84.253,32761.0,10240.0,Raw,48,17.0,922526,238864
649373,1540453056.800214,192.168.2.108,122.248.234.207,32761.0,10240.0,Raw,48,17.0,2062962,542832
50453,1540448235.297059,192.168.2.108,50.19.254.134,32761.0,10240.0,Raw,48,17.0,572338,161352
320448,1540451724.183024,192.168.2.112,61.188.37.216,34655.0,8000.0,,40,6.0,275372,475208
112476,1540450504.844399,192.168.2.109,8.8.8.8,56018.0,53.0,"DNS Qry ""b'authentication.eye4.cn.'""",48,17.0,139222,91444


## Numerical conversions

Convert numerical features to usable numbers.

In [16]:
def ip_to_numeric(ip):
    try:
        ip_obj = ipaddress.ip_interface(ip)
        ip = int(ip_obj.network.network_address)
    except ValueError:
        ip = 0

    return ip

In [17]:
malicious_features["Numeric Source IP"] = malicious_features["Source IP"].apply(
    ip_to_numeric
)

malicious_features["Numeric Destination IP"] = malicious_features["Destination IP"].apply(
    ip_to_numeric
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Numeric Source IP"] = malicious_features["Source IP"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Numeric Destination IP"] = malicious_features["Destination IP"].apply(


In [18]:
benign_features["Numeric Source IP"] = benign_features["Source IP"].apply(
    ip_to_numeric
)

benign_features["Numeric Destination IP"] = benign_features["Destination IP"].apply(
    ip_to_numeric
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Numeric Source IP"] = benign_features["Source IP"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Numeric Destination IP"] = benign_features["Destination IP"].apply(


In [19]:
# remove non-numeric IPs
malicious_features.pop("Source IP")
malicious_features.pop("Destination IP")

benign_features.pop("Source IP")
benign_features.pop("Destination IP")

4           224.0.0.2
5           224.0.0.2
6           224.0.0.2
7           224.0.0.2
26          224.0.0.2
             ...     
38585     194.247.5.1
38586    194.247.5.27
38587     194.247.5.1
38588    194.247.5.27
38589     194.247.5.1
Name: Destination IP, Length: 18615, dtype: object

# Categorical features
What about the text data? We can convert those to numbers too.

## One hot encoding
Binary encoding that creates a vector with 0s and 1s that correspond to specific categories. If your data had the category populated, mark it as 1 otherwise mark as 0. 

In [20]:
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    135: "SCTP",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [21]:
def one_hot_port(port, df):
    new_df = pd.DataFrame()
    for protocol_port, protocol_name in network_protocols.items():
        new_df[protocol_name] = df[port].apply(
            lambda port: 1 if port == protocol_port else 0
        )
    return new_df

In [22]:
malicious_protocol_one_hot = one_hot_port("Destination Port", malicious_features)
malicious_features = pd.concat([malicious_features, malicious_protocol_one_hot], axis=1)

In [23]:
benign_protocol_one_hot = one_hot_port("Destination Port", benign_features)
benign_features = pd.concat([benign_features, benign_protocol_one_hot], axis=1)

In [24]:
malicious_features.dtypes

Timestamp                  object
Source Port               float64
Destination Port          float64
Payload                    object
Packet Length               int64
Protocol                  float64
src_ip_total_bytes          int64
dst_ip_total_bytes          int64
Numeric Source IP           int64
Numeric Destination IP      int64
ICMP                        int64
TCP                         int64
UDP                         int64
Telnet                      int64
IPv6_encapsulation          int64
GRE                         int64
ESP                         int64
AH                          int64
DNS                         int64
ICMPv6                      int64
OSPF                        int64
SCTP                        int64
UDPLite                     int64
NETBIOS-NS                  int64
NETBIOS-DGM                 int64
NETBIOS-SSN                 int64
IMAP                        int64
SNMP                        int64
SNMP_trap                   int64
HTTPS         

## Frequency encoding
Counts the population that corresponds to a specific category. The result is still a vector of categories, however not with 0s ans 1s, but with real numbers that indicate how often the category is encountered in the data.

In [25]:
malicious_frequency_encoding = (
   malicious_features["Destination Port"].value_counts(normalize=True).to_dict()
)

In [26]:
malicious_frequency_encoding

{10240.0: 0.40615224868583294,
 53.0: 0.21641897592316178,
 80.0: 0.11450451035109352,
 23.0: 0.06143163086507885,
 21897.0: 0.046453371406320984,
 8280.0: 0.04079434096956324,
 443.0: 0.019923421377117268,
 8000.0: 0.019274449996755143,
 21047.0: 0.010643130637938867,
 8080.0: 0.01025374780972159,
 1900.0: 0.0068271789214095656,
 32100.0: 0.004536309948731261,
 50364.0: 0.0024076838211434877,
 2323.0: 0.002018300992926212,
 57206.0: 0.0018430787202284378,
 51009.0: 0.0015575313128691024,
 123.0: 0.0012330456226880395,
 46734.0: 0.000947498215328704,
 68.0: 0.0008826010772924914,
 52777.0: 0.000623012525147641,
 41313.0: 0.0002920371211629567,
 9000.0: 0.0002660782659484717,
 67.0: 0.00017522227269777403,
 5353.0: 0.0001622428450905315,
 8629.0: 0.000149263417483289,
 138.0: 0.00011681484846518268,
 137.0: 9.73457070543189e-05,
 5355.0: 5.1917710428970085e-05,
 41291.0: 4.542799662534882e-05,
 41282.0: 3.8938282821727564e-05,
 41274.0: 3.8938282821727564e-05,
 3018.0: 1.946914141086378

In [27]:
benign_frequency_encoding = (
   benign_features["Destination Port"].value_counts(normalize=True).to_dict()
)

In [28]:
malicious_features["dst_port_freq_encoded"] = malicious_features["Destination Port"].map(
    malicious_frequency_encoding
)

In [29]:
benign_features["dst_port_freq_encoded"] = benign_features["Destination Port"].map(
    benign_frequency_encoding
)

## Derivative

In [31]:
# Define a function to convert Scapy timestamps to pandas datetime
def scapy_timestamp_to_datetime(ts):
    return pd.to_datetime(
        ts.to_eng_string(), unit="s"
    )  # Convert to a format pandas understands


# Convert the Scapy timestamps to pandas datetime
malicious_features["Timestamp"] = malicious_features["Timestamp"].apply(scapy_timestamp_to_datetime)
benign_features["Timestamp"] = benign_features["Timestamp"].apply(scapy_timestamp_to_datetime)

In [32]:
malicious_features["Interarrival"] = malicious_features["Timestamp"].diff()
benign_features["Interarrival"] = benign_features["Timestamp"].diff()

In [38]:
malicious_features.sample(n=5)

Unnamed: 0,Timestamp,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes,Numeric Source IP,Numeric Destination IP,...,SNMP_trap,HTTPS,Syslog,LDAPS,FTPS,IMAPS,POP3S,SOCKS_proxy,dst_port_freq_encoded,Interarrival
609671,2018-10-25 07:34:55.974798918,45869.0,80.0,,40,6.0,1430910,38600,3232236145,1037844187,...,0,0,0,0,0,0,0,0,0.114505,0 days 00:00:00.000000954
279115,2018-10-25 07:12:36.800153971,32761.0,10240.0,Raw,48,17.0,1599518,450560,3232236140,780778550,...,0,0,0,0,0,0,0,0,0.406152,0 days 00:00:00.000215054
99408,2018-10-25 06:47:04.301630974,32761.0,10240.0,Raw,48,17.0,1124534,291040,3232236140,2061653245,...,0,0,0,0,0,0,0,0,0.406152,0 days 00:00:00.000000954
53466,2018-10-25 06:19:06.206826925,50861.0,10240.0,Raw,48,17.0,434025,47640,3232236145,2014854038,...,0,0,0,0,0,0,0,0,0.406152,0 days 00:00:00.007299900
17114,2018-10-25 05:56:36.298422098,32761.0,10240.0,Raw,48,17.0,190024,49104,3232236140,2061653245,...,0,0,0,0,0,0,0,0,0.406152,0 days 00:00:00.000002146


In [35]:
malicious_features.dtypes

Timestamp                  datetime64[ns]
Source Port                       float64
Destination Port                  float64
Payload                            object
Packet Length                       int64
Protocol                          float64
src_ip_total_bytes                  int64
dst_ip_total_bytes                  int64
Numeric Source IP                   int64
Numeric Destination IP              int64
ICMP                                int64
TCP                                 int64
UDP                                 int64
Telnet                              int64
IPv6_encapsulation                  int64
GRE                                 int64
ESP                                 int64
AH                                  int64
DNS                                 int64
ICMPv6                              int64
OSPF                                int64
SCTP                                int64
UDPLite                             int64
NETBIOS-NS                        

In [39]:
malicious_features.to_pickle("./data/malicious_features.pkl")
benign_features.to_pickle("./data/benign_features.pkl")