In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ipaddress
 
from utils.pcap import pcap_to_dataframe, extract_streams



# Load data
There are two ways to load the data:

- Directly reading a `pcap` and converting it to a Pandas `DataFrame`,
- Preloading the dataframe to a `.pkl` file. For more information on pickle files check the [article from RealPython about pickle module](https://realpython.com/python-pickle-module/).

In [2]:
malicious_pcap = "./data/mirai.pcap"
benign_pcap = "./data/benign.pcapng"
malicious_pkl = "./data/mirai.pkl"
benign_pkl = "./data/benign.pkl"

In [3]:
# first time you run this, you should create your own pkl
# for security reasons we do not recommend to use untrusted pkl files
READ_FROM_PKL = True

In [4]:
if READ_FROM_PKL:
    malicious_df = pd.read_pickle(malicious_pkl)
    benign_df = pd.read_pickle(benign_pkl)
    
    malicious_stream_df = pd.read_pickle("data/mirai_stream.pkl")
    benign_stream_df = pd.read_pickle("data/benign_stream.pkl")
else:
    malicious_df = pcap_to_dataframe(malicious_pcap)
    benign_df = pcap_to_dataframe(benign_pcap)
    malicious_stream_df = extract_streams(malicious_df)
    benign_stream_df = extract_streams(benign_df)
    # save to pkl since dataframe conversion takes a long time
    malicious_df.to_pickle(malicious_pkl)
    benign_df.to_pickle(benign_pkl)

In [5]:
malicious_df.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
435321,1540452189.236584,,,,,ARP who has 192.168.2.189 says 192.168.2.110 /...,60,
224511,1540451336.392649,,,,,ARP who has 192.168.2.38 says 192.168.2.110 / ...,60,
592139,1540452824.616165,,,,,ARP who has 192.168.2.208 says 192.168.2.110 /...,60,
472402,1540452339.438274,,,,,ARP who has 192.168.2.61 says 192.168.2.110 / ...,60,
622711,1540452948.327314,,,,,ARP who has 192.168.2.28 says 192.168.2.110 / ...,60,
161518,1540451057.475294,192.168.2.101,192.168.2.110,,,Raw,40,1.0
34032,1540447625.56945,192.168.2.113,114.215.137.159,50861.0,10240.0,Raw,48,17.0
553204,1540452666.766866,,,,,ARP who has 192.168.2.170 says 192.168.2.110 /...,60,
455819,1540452272.654175,192.168.2.1,192.168.2.109,,,"IPerror / UDPerror / DNS Qry ""b'time.windows.c...",70,1.0
59480,1540448565.304941,192.168.2.113,122.226.84.253,50861.0,10240.0,Raw,48,17.0


In [6]:
malicious_df.shape

(764137, 8)

In [7]:
# copy the dataframes to a features dataframe while omitting the packets with NaN src/dst ips/ports
malicious_features = malicious_df.dropna(subset=["Source IP", "Destination IP", "Source Port", "Destination Port"])
benign_features = benign_df.dropna(subset=["Source IP", "Destination IP", "Source Port", "Destination Port"])

In [8]:
malicious_features.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
189369,1540451194.681921,192.168.2.113,61.220.62.219,45029.0,80.0,,40,6.0
87739,1540449588.31828,192.168.2.109,114.114.114.114,44896.0,53.0,"DNS Qry ""b'authentication.eye4.cn.'""",48,17.0
50493,1540448236.872254,192.168.2.104,192.168.2.1,1471.0,53.0,"DNS Qry ""b'10024main.broadlink.com.cn.'""",52,17.0
153098,1540451023.817472,192.168.2.109,192.168.2.1,34373.0,53.0,"DNS Qry ""b'pisr-3.eyecloud.so.'""",44,17.0
617836,1540452928.801049,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0
238983,1540451394.874534,192.168.2.113,61.188.37.216,50861.0,10240.0,Raw,48,17.0
86179,1540449530.903291,192.168.2.1,192.168.2.115,53.0,4642.0,DNS Ans,83,17.0
8279,1540446670.383374,192.168.2.1,192.168.2.115,53.0,2659.0,DNS Ans,83,17.0
17,1540446383.558014,192.168.2.113,122.248.234.207,50861.0,10240.0,Raw,48,17.0
38424,1540447789.617415,192.168.2.110,114.114.114.114,53133.0,53.0,"DNS Qry ""b'ntp2.eye4.cn.'""",38,17.0


In [9]:
malicious_features.shape

(154090, 8)

# Numerical features
Post processing numbers to ... better numbers that describe context or condense our data information.

## Cumulative

Summarize your numerical features and give them a new meaning and utility.

In [10]:
malicious_features["src_ip_total_bytes"] = malicious_features.groupby("Source IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["src_ip_total_bytes"] = malicious_features.groupby("Source IP")[


In [11]:
benign_features["src_ip_total_bytes"] = benign_features.groupby("Source IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["src_ip_total_bytes"] = benign_features.groupby("Source IP")[


In [12]:
malicious_features["dst_ip_total_bytes"] = malicious_features.groupby("Destination IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["dst_ip_total_bytes"] = malicious_features.groupby("Destination IP")[


In [13]:
benign_features["dst_ip_total_bytes"] = benign_features.groupby("Destination IP")[
    "Packet Length"
].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["dst_ip_total_bytes"] = benign_features.groupby("Destination IP")[


In [14]:
malicious_features.sample(n=10)

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes
22505,1540447197.799599,192.168.2.108,46.137.188.54,32761.0,10240.0,Raw,48,17.0,252648,71320
18711,1540447057.797787,192.168.2.108,122.248.234.207,32761.0,10240.0,Raw,48,17.0,209192,55368
699885,1540453260.294268,192.168.2.112,192.168.2.1,34357.0,53.0,"DNS Qry ""b'time-nw.nist.gov.'""",42,17.0,354340,1368391
63221,1540448696.297804,192.168.2.108,46.137.188.54,32761.0,10240.0,Raw,48,17.0,714810,201896
441063,1540452212.412578,192.168.2.108,52.24.43.67,30075.0,80.0,Padding,26,6.0,1801152,132680
126061,1540450886.802786,192.168.2.108,61.188.37.216,32761.0,10240.0,Raw,48,17.0,1391298,401048
77095,1540449195.297971,192.168.2.108,50.19.254.134,32761.0,10240.0,Raw,48,17.0,868602,243840
148922,1540451007.288963,192.168.2.108,52.25.66.250,15155.0,8280.0,Padding,26,6.0,1428696,104702
108803,1540450370.859559,192.168.2.108,52.25.66.250,48028.0,8280.0,Padding,26,6.0,1231726,90012
77357,1540449206.977751,192.168.2.115,192.168.2.1,4392.0,53.0,"DNS Qry ""b'xmpp.samsungsmartcam.com.'""",50,17.0,349623,542822


## Numerical conversions

Convert numerical features to usable numbers.

In [15]:
def ip_to_numeric(ip):
    try:
        ip_obj = ipaddress.ip_interface(ip)
        ip = int(ip_obj.network.network_address)
    except ValueError:
        ip = 0

    return ip

In [16]:
malicious_features["Numeric Source IP"] = malicious_features["Source IP"].apply(
    ip_to_numeric
)

malicious_features["Numeric Destination IP"] = malicious_features["Destination IP"].apply(
    ip_to_numeric
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Numeric Source IP"] = malicious_features["Source IP"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Numeric Destination IP"] = malicious_features["Destination IP"].apply(


In [17]:
benign_features["Numeric Source IP"] = benign_features["Source IP"].apply(
    ip_to_numeric
)

benign_features["Numeric Destination IP"] = benign_features["Destination IP"].apply(
    ip_to_numeric
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Numeric Source IP"] = benign_features["Source IP"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Numeric Destination IP"] = benign_features["Destination IP"].apply(


In [18]:
# remove non-numeric IPs
malicious_features.pop("Source IP")
malicious_features.pop("Destination IP")

benign_features.pop("Source IP")
benign_features.pop("Destination IP")

4           224.0.0.2
5           224.0.0.2
6           224.0.0.2
7           224.0.0.2
26          224.0.0.2
             ...     
38585     194.247.5.1
38586    194.247.5.27
38587     194.247.5.1
38588    194.247.5.27
38589     194.247.5.1
Name: Destination IP, Length: 18615, dtype: object

# Categorical features
What about the text data? We can convert those to numbers too.

## Frequency encoding
Counts the population that corresponds to a specific category. The result is still a vector of categories, however not with 0s ans 1s, but with real numbers that indicate how often the category is encountered in the data.

In [19]:
malicious_frequency_encoding = (
   malicious_features["Destination Port"].value_counts(normalize=True).to_dict()
)

In [20]:
malicious_frequency_encoding

{10240.0: 0.40615224868583294,
 53.0: 0.21641897592316178,
 80.0: 0.11450451035109352,
 23.0: 0.06143163086507885,
 21897.0: 0.046453371406320984,
 8280.0: 0.04079434096956324,
 443.0: 0.019923421377117268,
 8000.0: 0.019274449996755143,
 21047.0: 0.010643130637938867,
 8080.0: 0.01025374780972159,
 1900.0: 0.0068271789214095656,
 32100.0: 0.004536309948731261,
 50364.0: 0.0024076838211434877,
 2323.0: 0.002018300992926212,
 57206.0: 0.0018430787202284378,
 51009.0: 0.0015575313128691024,
 123.0: 0.0012330456226880395,
 46734.0: 0.000947498215328704,
 68.0: 0.0008826010772924914,
 52777.0: 0.000623012525147641,
 41313.0: 0.0002920371211629567,
 9000.0: 0.0002660782659484717,
 67.0: 0.00017522227269777403,
 5353.0: 0.0001622428450905315,
 8629.0: 0.000149263417483289,
 138.0: 0.00011681484846518268,
 137.0: 9.73457070543189e-05,
 5355.0: 5.1917710428970085e-05,
 41291.0: 4.542799662534882e-05,
 41282.0: 3.8938282821727564e-05,
 41274.0: 3.8938282821727564e-05,
 3018.0: 1.946914141086378

In [21]:
benign_frequency_encoding = (
   benign_features["Destination Port"].value_counts(normalize=True).to_dict()
)

In [22]:
malicious_features["dst_port_freq_encoded"] = malicious_features["Destination Port"].map(
    malicious_frequency_encoding
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["dst_port_freq_encoded"] = malicious_features["Destination Port"].map(


In [23]:
benign_features["dst_port_freq_encoded"] = benign_features["Destination Port"].map(
    benign_frequency_encoding
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["dst_port_freq_encoded"] = benign_features["Destination Port"].map(


## Derivative

In [24]:
# Define a function to convert Scapy timestamps to pandas datetime
def scapy_timestamp_to_datetime(ts):
    return pd.to_datetime(
        ts.to_eng_string(), unit="s"
    )  # Convert to a format pandas understands


# Convert the Scapy timestamps to pandas datetime
malicious_features["Timestamp"] = malicious_features["Timestamp"].apply(scapy_timestamp_to_datetime).astype(int) / 10**9
benign_features["Timestamp"] = benign_features["Timestamp"].apply(scapy_timestamp_to_datetime).astype(int) / 10**9

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Timestamp"] = malicious_features["Timestamp"].apply(scapy_timestamp_to_datetime).astype(int) / 10**9
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Timestamp"] = benign_features["Timestamp"].apply(scapy_timestamp_to_datetime).astype(int) / 10**9


In [25]:
malicious_features["Interarrival"] = malicious_features["Timestamp"].diff()
benign_features["Interarrival"] = benign_features["Timestamp"].diff()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  malicious_features["Interarrival"] = malicious_features["Timestamp"].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  benign_features["Interarrival"] = benign_features["Timestamp"].diff()


In [26]:
malicious_features.sample(n=5)

Unnamed: 0,Timestamp,Source Port,Destination Port,Payload,Packet Length,Protocol,src_ip_total_bytes,dst_ip_total_bytes,Numeric Source IP,Numeric Destination IP,dst_port_freq_encoded,Interarrival
112716,1540451000.0,50641.0,80.0,,40,6.0,212924,359616,3232236144,780778550,0.114505,0.30866
121140,1540451000.0,32761.0,10240.0,Raw,48,17.0,1372078,361440,3232236140,2063133391,0.406152,0.000125
161674,1540451000.0,50861.0,10240.0,Raw,48,17.0,1027683,112432,3232236145,2014854038,0.406152,0.004587
702624,1540453000.0,57457.0,53.0,"DNS Qry ""b'time-a.nist.gov.'""",41,17.0,354982,1370530,3232236144,3232236033,0.216419,0.001739
80141,1540449000.0,32761.0,10240.0,Raw,48,17.0,903594,253600,3232236140,840171142,0.406152,0.072376


In [27]:
malicious_features.dtypes

Timestamp                 float64
Source Port               float64
Destination Port          float64
Payload                    object
Packet Length               int64
Protocol                  float64
src_ip_total_bytes          int64
dst_ip_total_bytes          int64
Numeric Source IP           int64
Numeric Destination IP      int64
dst_port_freq_encoded     float64
Interarrival              float64
dtype: object

In [28]:
malicious_features.to_pickle("./data/malicious_features_numeric.pkl")
benign_features.to_pickle("./data/benign_features_numeric.pkl")

## One hot encoding
Binary encoding that creates a vector with 0s and 1s that correspond to specific categories. If your data had the category populated, mark it as 1 otherwise mark as 0. 

In [29]:
network_protocols = {
    1: "ICMP",
    6: "TCP",
    17: "UDP",
    23: "Telnet",
    41: "IPv6_encapsulation",
    47: "GRE",
    50: "ESP",
    51: "AH",
    53: "DNS",
    58: "ICMPv6",
    89: "OSPF",
    132: "SCTP",
    135: "SCTP",
    136: "UDPLite",
    137: "NETBIOS-NS",
    138: "NETBIOS-DGM",
    139: "NETBIOS-SSN",
    143: "IMAP",
    161: "SNMP",
    162: "SNMP_trap",
    443: "HTTPS",
    514: "Syslog",
    636: "LDAPS",
    989: "FTPS",
    993: "IMAPS",
    995: "POP3S",
    1080: "SOCKS_proxy",
    # Add more protocols as needed
}

In [30]:
def one_hot_port(port, df):
    new_df = pd.DataFrame()
    for protocol_port, protocol_name in network_protocols.items():
        new_df[protocol_name] = df[port].apply(
            lambda port: 1 if port == protocol_port else 0
        )
    return new_df

In [31]:
malicious_protocol_one_hot = one_hot_port("Destination Port", malicious_features)
malicious_features = pd.concat([malicious_features, malicious_protocol_one_hot], axis=1)

In [None]:
benign_protocol_one_hot = one_hot_port("Destination Port", benign_features)
benign_features = pd.concat([benign_features, benign_protocol_one_hot], axis=1)

In [None]:
malicious_features.to_pickle("./data/malicious_features.pkl")
benign_features.to_pickle("./data/benign_features.pkl")