In [1]:
import pandas as pd
import numpy as np
import sweetviz as sv
import ipaddress


from scapy.all import PcapReader, IP, TCP, UDP, ICMP
from scipy.stats import ttest_ind, kstest, norm, skew, kurtosis, zscore
from sklearn.linear_model import LinearRegression
from skimpy import skim
from summarytools import dfSummary

In [2]:
# Open the capture file with scapy's PcapReader and show which reader class was returned.
# NOTE(review): the reader is never closed in the visible cells, and the only code that
# iterates over it is the commented-out extraction loop further down.
pcap_reader = PcapReader("../data/blog_show_data/mirai.pcap")
type(pcap_reader)

scapy.utils.PcapNgReader

# Preprocess

- convert data to streams
- collect some numbers

In [3]:
# Create an empty list to store the data
# data = []

# # Iterate through the packets in the pcap file
# for packet in pcap_reader:
#     # Get the source and destination IP addresses
#     if packet.haslayer(IP):
#         src_ip = packet[IP].src
#         dst_ip = packet[IP].dst
#         protocol = packet[IP].proto
#     else:
#         src_ip = None
#         dst_ip = None
#         protocol = None
    
#     # Get the source and destination ports and payload
#     if packet.haslayer(TCP):
#         src_port = packet[TCP].sport
#         dst_port = packet[TCP].dport
#         payload = str(packet[TCP].payload)
#         packet_len = len(packet[TCP])
#     elif packet.haslayer(UDP):
#         src_port = packet[UDP].sport
#         dst_port = packet[UDP].dport
#         payload = str(packet[UDP].payload)
#         packet_len = len(packet[UDP])
#     elif packet.haslayer(ICMP):
#         payload = str(packet[ICMP].payload)
#         packet_len = len(packet[ICMP])
#         src_port = None
#         dst_port = None
#     else:
#         src_port = None
#         dst_port = None
#         payload = str(packet.payload)
#         packet_len = len(packet)
    
#     # Append the data to the list
#     data.append([packet.time, src_ip, dst_ip, src_port, dst_port, payload, packet_len, protocol])

# # Convert the list to a pandas dataframe
# mirai_df = pd.DataFrame(data, columns=['Timestamp', 'Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Payload', 'Packet Length', 'Protocol'])

# mirai_df


In [4]:
# mirai_df.to_pickle("../data/bsides_aug/mirai.pkl")
# Load the packet table from the pickle file (presumably the one written by the
# commented-out to_pickle call above - TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
mirai_df = pd.read_pickle("../data/bsides_aug/mirai.pkl")

In [5]:
# Display the packet-level table loaded above.
mirai_df

Unnamed: 0,Timestamp,Source IP,Destination IP,Source Port,Destination Port,Payload,Packet Length,Protocol
0,1540450947.65791,,,,,ARP who has 192.168.2.66 says 192.168.2.110 / ...,60,
1,1540450947.663658,,,,,ARP who has 192.168.2.67 says 192.168.2.110 / ...,60,
2,1540450947.664393,,,,,ARP who has 192.168.2.68 says 192.168.2.110 / ...,60,
3,1540450947.668903,,,,,ARP who has 192.168.2.69 says 192.168.2.110 / ...,60,
4,1540450947.670141,,,,,ARP who has 192.168.2.70 says 192.168.2.110 / ...,60,
...,...,...,...,...,...,...,...,...
625615,1540453519.837515,,,,,ARP who has 192.168.2.165 says 192.168.2.110 /...,60,
625616,1540453519.839396,,,,,ARP who has 192.168.2.166 says 192.168.2.110 /...,60,
625617,1540453519.840611,,,,,ARP who has 192.168.2.167 says 192.168.2.110 /...,60,
625618,1540453519.842369,,,,,ARP who has 192.168.2.168 says 192.168.2.110 /...,60,


In [6]:
# Summarise every flow (source/destination IP, source/destination port, protocol)
# as a single row. The rows are collected as plain dicts and converted to a
# DataFrame once at the end, instead of creating a one-row DataFrame per flow and
# concatenating thousands of them.
stream_columns = ['Source IP', 'Destination IP', 'Source Port', 'Destination Port',
                  'Protocol', 'Number of Packets', 'Total Length', 'Duration']
stream_records = []

# Group packets by src/dst IP, src/dst port and protocol.
# NOTE: pandas' groupby excludes keys containing NaN by default (dropna=True), so
# packets with a missing IP, port or protocol value are not part of any stream.
grouped = mirai_df.groupby(['Source IP', 'Destination IP', 'Source Port', 'Destination Port', 'Protocol'])

# Iterate through each group to extract stream data
for (src_ip, dst_ip, src_port, dst_port, proto), group in grouped:
    timestamps = group['Timestamp']
    stream_records.append({
        'Source IP': src_ip,
        'Destination IP': dst_ip,
        'Source Port': src_port,
        'Destination Port': dst_port,
        'Protocol': proto,
        # Number of packets, total length, and duration of the stream
        'Number of Packets': len(group),
        'Total Length': group['Packet Length'].sum(),
        # Difference between the last and first timestamp in the group, as a plain float
        'Duration': float(timestamps.max() - timestamps.min()),
    })

# Build the stream table in one step; passing the column list keeps the column
# order fixed and yields an empty (rather than failing) frame when there are no groups.
stream_df = pd.DataFrame(stream_records, columns=stream_columns)

# Print the new dataframe with stream data
print(stream_df)


           Source IP   Destination IP  Source Port  Destination Port  \
0            0.0.0.0  255.255.255.255         68.0              67.0   
1        192.168.2.1    192.168.2.110         23.0           21897.0   
2        192.168.2.1    192.168.2.110         23.0           46734.0   
3        192.168.2.1    192.168.2.110         23.0           50364.0   
4        192.168.2.1    192.168.2.111         53.0           32806.0   
...              ...              ...          ...               ...   
14133  192.168.2.126      54.154.5.26      29783.0           32100.0   
14134  192.168.2.126      54.154.5.26      29858.0           32100.0   
14135  192.168.2.126          8.8.8.8       3126.0              53.0   
14136  192.168.2.126          8.8.8.8       3130.0              53.0   
14137  192.168.2.196    192.168.2.101      53781.0           51009.0   

       Protocol  Number of Packets  Total Length     Duration  
0          17.0                  2           332     0.304461  
1      

# EDA

## Descriptive statistics & data

- Describe columns and data types
- Descriptive statistics
  -  count, 
  -  mean, 
  -  standard deviation, 
  -  minimum, 
  -  25th percentile, 
  -  median (50th percentile), 
  -  75th percentile, and 
  -  maximum

In [7]:
# Column labels of the per-stream summary frame.
stream_df.columns

Index(['Source IP', 'Destination IP', 'Source Port', 'Destination Port',
       'Protocol', 'Number of Packets', 'Total Length', 'Duration'],
      dtype='object')

In [8]:
# Data type of each column in the per-stream frame.
stream_df.dtypes

Source IP             object
Destination IP        object
Source Port          float64
Destination Port     float64
Protocol             float64
Number of Packets      int64
Total Length           int64
Duration             float64
dtype: object

In [9]:
# Descriptive statistics (count, mean, std, min, quartiles, max) via DataFrame.describe().
stream_df.describe()

Unnamed: 0,Source Port,Destination Port,Protocol,Number of Packets,Total Length,Duration
count,14138.0,14138.0,14138.0,14138.0,14138.0,14138.0
mean,32980.104895,2152.375088,13.219479,4.705687,199.615009,14.185579
std,21650.44527,6092.466512,5.224492,72.311029,3057.301765,153.306208
min,0.0,23.0,6.0,1.0,26.0,0.0
25%,3206.25,53.0,6.0,1.0,40.0,0.0
50%,40048.5,53.0,17.0,1.0,74.0,0.0
75%,51555.0,443.0,17.0,3.0,104.0,2.85028
max,64633.0,63749.0,17.0,2862.0,135840.0,2571.501515


In [10]:
# Correlation matrix for the numerical columns of the dataframe.
# `numeric_only=True` is passed explicitly because stream_df still contains the
# string columns 'Source IP' and 'Destination IP'; relying on the implicit default
# is deprecated in pandas 1.5 and raises in pandas 2.x.
stream_df.corr(numeric_only=True)

  stream_df.corr()


Unnamed: 0,Source Port,Destination Port,Protocol,Number of Packets,Total Length,Duration
Source Port,1.0,-0.135555,-0.311714,-0.013628,-0.012811,-0.016093
Destination Port,-0.135555,1.0,-0.214269,0.057033,0.057138,0.063952
Protocol,-0.311714,-0.214269,1.0,-0.013221,0.005097,0.005727
Number of Packets,-0.013628,0.057033,-0.013221,1.0,0.959277,0.620179
Total Length,-0.012811,0.057138,0.005097,0.959277,1.0,0.603591
Duration,-0.016093,0.063952,0.005727,0.620179,0.603591,1.0


## Hypothesis testing

- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that data follows normal distribution
- Types of variables
  - dependent: the effect (outcome) we want to explain. For example, whether a network is compromised.
  - independent: the presumed cause. For example, the number of HTTP requests, which may affect whether a network is compromised.

In [11]:

def hypothesis_testing(df, col1, col2, alpha=0.05):
    """Compare the means of two columns with an independent two-sample t-test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    col1, col2 : str
        Names of the columns to compare.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    str
        A sentence stating whether the difference is statistically significant
        at level ``alpha``, or that the test is inconclusive when the p-value
        is undefined (NaN).
    """
    # Missing values would turn the p-value into NaN, which the comparison
    # below would otherwise silently report as "not significant".
    group1 = df[col1].dropna()
    group2 = df[col2].dropna()
    pvalue = ttest_ind(group1, group2)[1]
    if np.isnan(pvalue):
        # e.g. an empty column or samples the statistic is undefined for
        return "The t-test between {} and {} is inconclusive (p-value is undefined)".format(col1, col2)
    if pvalue < alpha:
        return "The difference between {} and {} is statistically significant (p < {})".format(col1, col2, alpha)
    return "The difference between {} and {} is not statistically significant (p >= {})".format(col1, col2, alpha)


In [12]:
# Run the t-test helper on two per-stream columns.
# NOTE(review): both columns describe the same streams and measure different
# quantities (packet count vs. length) - confirm an independent two-sample
# t-test is the intended comparison.
hypothesis_testing(stream_df, 'Number of Packets', 'Total Length')

'The difference between Number of Packets and Total Length is statistically significant (p < 0.05)'

## Regression Analysis

- Models relationship between a dependent variable and one or more independent variables
- Linear regression
  - fit a straight line to the data
  - calculate coefficients

In [13]:
def regression_analysis(df, x_cols, y_col):
    """Fit an ordinary least-squares model of ``y_col`` on ``x_cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding predictor and target columns.
    x_cols : str or sequence of str
        Predictor column name(s). A single name may be passed as a plain string.
    y_col : str
        Target column name.

    Returns
    -------
    dict
        ``{"R-squared": <score on the training data>, "Coefficients": <model.coef_>}``.
        The target is passed as a column vector, so the coefficients come back
        as a 2-D array with one row.
    """
    # A bare string would otherwise be measured by its character count when
    # shaping the design matrix.
    if isinstance(x_cols, str):
        x_cols = [x_cols]
    # Selecting with a list always yields a 2-D (n_samples, n_features) array.
    X = df[list(x_cols)].to_numpy()
    y = df[y_col].to_numpy().reshape(-1, 1)
    model = LinearRegression().fit(X, y)
    return {"R-squared": model.score(X, y), "Coefficients": model.coef_}


In [14]:
# Regress 'Total Length' on 'Number of Packets' and 'Duration'.
regression_analysis(stream_df, ['Number of Packets', 'Duration'], 'Total Length')

{'R-squared': 0.9203348633587376,
 'Coefficients': array([[40.18879986,  0.280892  ]])}

## Kolmogorov-Smirnov test

- compare two sample distributions
- useful for fitting to a distribution
- test whether:
  - a sample came from a given reference distribution (one-sample test)
  - two samples came from the same distribution (two-sample test)
- Uses metric `D`
  - max absolute difference between empirical distribution function of the samples and cumulative distribution of the reference distribution
- Null hypothesis: 
  - one-sample test: the sample came from the reference distribution
  - two-sample test: both samples came from the same distribution

In [15]:
def kolmogorov_smirnov_test(df, column, alpha=0.05):
    """One-sample Kolmogorov-Smirnov test of a column against a normal distribution.

    The reference normal distribution uses the sample's own mean and standard
    deviation (NumPy's default population standard deviation).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    column : str
        Name of the column to test.
    alpha : float, default 0.05
        Significance level the p-value is compared against.

    Returns
    -------
    str
        A sentence stating whether the distribution differs significantly from
        a normal distribution at level ``alpha``, or that the test is
        inconclusive when it cannot be computed.

    Notes
    -----
    NOTE(review): because the reference parameters are estimated from the same
    sample, the plain KS p-value is only approximate (a Lilliefors-type
    correction would be needed for an exact test) - confirm this is acceptable.
    """
    inconclusive = "The normality test for {} is inconclusive (no usable data or zero variance)".format(column)
    # Missing values would make the mean/std (and therefore the p-value) NaN.
    sample = df[column].dropna().to_numpy()
    if sample.size == 0 or sample.std() == 0:
        # A normal distribution with zero spread is degenerate; nothing to test.
        return inconclusive
    _, pvalue = kstest(sample, norm.cdf, args=(sample.mean(), sample.std()))
    if np.isnan(pvalue):
        return inconclusive
    if pvalue < alpha:
        return "The distribution of {} is significantly different from a normal distribution (p < {})".format(column, alpha)
    return "The distribution of {} is not significantly different from a normal distribution (p >= {})".format(column, alpha)


In [16]:
# Test the 'Total Length' column against a normal distribution.
kolmogorov_smirnov_test(stream_df, 'Total Length')

'The distribution of Total Length is significantly different from a normal distribution (p < 0.05)'

## Skewness and Kurtosis

- information about the shape of the distribution
- Skewness: measure the degree of asymmetry
  - symmetric: equally balanced around its mean
  - asymmetric: not equally balanced
  - positive skewness: distribution longer on the right side
  - negative skewness: longer on the left
  - 0: completely symmetric
- Kurtosis: peakedness of distribution
  - high: sharp peak, long tails
  - low: flat peak, short tails
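  - ex. normal distribution has kurtosis 3, mesokurtic
    - `> 3` leptokurtic
    - `< 3` platykurtic
  - note: `scipy.stats.kurtosis` returns *excess* kurtosis by default (`fisher=True`), i.e. kurtosis minus 3, so in the output below a normal distribution would score 0 rather than 3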

In [17]:
def skewness_kurtosis(df):
    """Return skewness and kurtosis for every numeric column of ``df``.

    The result is a flat dict with two entries per numeric column, keyed
    ``"<column>_skewness"`` and ``"<column>_kurtosis"``, in column order.
    Values come from ``scipy.stats.skew`` and ``scipy.stats.kurtosis`` called
    with their default arguments.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    shape_stats = {}
    for name, values in numeric_part.items():
        shape_stats.update({
            name + "_skewness": skew(values),
            name + "_kurtosis": kurtosis(values),
        })
    return shape_stats


In [18]:
# Skewness and kurtosis of every numeric column in the per-stream frame.
skewness_kurtosis(stream_df)

{'Source Port_skewness': -0.5085045967000315,
 'Source Port_kurtosis': -1.3242099807012182,
 'Destination Port_skewness': 5.300748261730351,
 'Destination Port_kurtosis': 35.39481020672699,
 'Protocol_skewness': -0.6582611625651278,
 'Protocol_kurtosis': -1.566692241858407,
 'Number of Packets_skewness': 31.29479930443702,
 'Number of Packets_kurtosis': 1089.3978856143829,
 'Total Length_skewness': 35.36865267114758,
 'Total Length_kurtosis': 1427.7647108932754,
 'Duration_skewness': 13.816872833840316,
 'Duration_kurtosis': 200.28279037205246}

## Outliers

- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- significant impact in statistical analysis
- measurements
  - z-score: `(x - mean) / std_dev`
  - IQR method: this method identifies outliers as observations that are below `Q1 - 1.5IQR` or above `Q3 + 1.5IQR`, where Q1 and Q3 are the first and third quartiles, and IQR is the interquartile range (the difference between Q3 and Q1).
  - visual inspection

In [19]:
def detect_outliers_zscore(df, column, threshold=3):
    """Return the rows of ``df`` whose ``column`` value is a z-score outlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Numeric column the z-scores are computed on.
    threshold : float, default 3
        Rows with ``|z| > threshold`` are returned.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) flagged as outliers.
        Rows whose value is missing are never flagged.
    """
    values = df[column].to_numpy(dtype=float, na_value=np.nan)
    # With the default nan_policy a single NaN turns every z-score into NaN, so
    # nothing would ever be flagged; "omit" computes mean/std from the valid
    # values only and leaves NaN just at the missing positions.
    zscores = np.abs(zscore(values, nan_policy="omit"))
    # Comparisons against NaN are False, so missing rows drop out of the mask.
    return df[zscores > threshold]

In [20]:
# Streams whose 'Total Length' z-score exceeds 3 in absolute value.
outliers = detect_outliers_zscore(stream_df, "Total Length", threshold=3)
print(outliers)

           Source IP   Destination IP  Source Port  Destination Port  \
1        192.168.2.1    192.168.2.110         23.0           21897.0   
1611     192.168.2.1  239.255.255.250       1900.0            1900.0   
1613   192.168.2.101    192.168.2.110         23.0           21897.0   
1624   192.168.2.103    192.168.2.110         23.0           21897.0   
2083   192.168.2.107    192.168.2.110         23.0           21897.0   
2087   192.168.2.108   122.226.84.253      32761.0           10240.0   
2088   192.168.2.108  122.248.234.207      32761.0           10240.0   
2089   192.168.2.108    46.137.188.54      32761.0           10240.0   
2090   192.168.2.108    50.19.254.134      32761.0           10240.0   
3225   192.168.2.108    61.188.37.216      32761.0           10240.0   
6219   192.168.2.110      192.168.2.1      21897.0              23.0   
6884   192.168.2.110    192.168.2.101      21897.0              23.0   
6887   192.168.2.110    192.168.2.103      21897.0              

# Feature Engineering

In [21]:
# convert ip address to numeric values
def ip_to_numeric(ip):
    """Convert an IP address string to its integer value.

    The input is parsed with ``ipaddress.ip_interface``; the integer returned
    is that of the network address of the parsed interface. For a bare
    address (no prefix length) this is the address itself; for input with a
    prefix such as ``"192.168.2.77/24"`` it is the enclosing network's address.
    """
    network = ipaddress.ip_interface(ip).network
    return int(network.network_address)

In [22]:
# Add integer-encoded versions of both address columns to stream_df (modifies the frame in place).
stream_df['Source IP Numeric'] = stream_df['Source IP'].apply(ip_to_numeric)
stream_df['Destination IP Numeric'] = stream_df['Destination IP'].apply(ip_to_numeric)

In [23]:
# New frame without the two string IP columns; their numeric encodings are kept.
stream_df_numeric = stream_df.drop(columns=["Source IP", "Destination IP"])

In [24]:
# Ensure 'Duration' is stored as float in the numeric frame. The cast result is
# written back to stream_df_numeric, the frame it is read from and the one the
# following cells inspect.
stream_df_numeric["Duration"] = stream_df_numeric["Duration"].astype(float)

In [25]:
# Check the column dtypes of the numeric frame.
stream_df_numeric.dtypes

Source Port               float64
Destination Port          float64
Protocol                  float64
Number of Packets           int64
Total Length                int64
Duration                  float64
Source IP Numeric           int64
Destination IP Numeric      int64
dtype: object

## Summaries & Visualizations

In [26]:
# Summary of the numeric frame using skimpy's skim().
skim(stream_df_numeric)

In [27]:
# Per-column summary table of the numeric frame using summarytools' dfSummary().
dfSummary(stream_df_numeric)

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,Source Port [float64],Mean (sd) : 32980.1 (21650.4) min < med < max: 0.0 < 40048.5 < 64633.0 IQR (CV) : 48348.8 (1.5),"10,709 distinct values",,0 (0.0%)
2,Destination Port [float64],Mean (sd) : 2152.4 (6092.5) min < med < max: 23.0 < 53.0 < 63749.0 IQR (CV) : 390.0 (0.4),"1,643 distinct values",,0 (0.0%)
3,Protocol [float64],1. 17.0 2. 6.0,"9,279 (65.6%) 4,859 (34.4%)",,0 (0.0%)
4,Number of Packets [int64],Mean (sd) : 4.7 (72.3) min < med < max: 1.0 < 1.0 < 2862.0 IQR (CV) : 2.0 (0.1),39 distinct values,,0 (0.0%)
5,Total Length [int64],Mean (sd) : 199.6 (3057.3) min < med < max: 26.0 < 74.0 < 135840.0 IQR (CV) : 64.0 (0.1),136 distinct values,,0 (0.0%)
6,Duration [float64],Mean (sd) : 14.2 (153.3) min < med < max: 0.0 < 0.0 < 2571.5 IQR (CV) : 2.9 (0.1),"5,153 distinct values",,0 (0.0%)
7,Source IP Numeric [int64],Mean (sd) : 3232007510.6 (27183732.8) min < med < max: 0.0 < 3232236142.0 < 3232236228.0 IQR (CV) : 4.0 (118.9),19 distinct values,,0 (0.0%)
8,Destination IP Numeric [int64],Mean (sd) : 2213405764.9 (1137748786.3) min < med < max: 134744072.0 < 2063133391.0 < 4294967295.0 IQR (CV) : 2196495401.0 (1.9),47 distinct values,,0 (0.0%)


In [28]:
# Build a sweetviz report for the numeric frame and write it out as HTML.
# Per the recorded output, show_html() saves SWEETVIZ_REPORT.html and may also
# try to open it in a web browser.
my_report = sv.analyze(stream_df_numeric)
my_report.show_html()

  all_source_names = [cur_name for cur_name, cur_series in source_df.iteritems()]
  filtered_series_names_in_source = [cur_name for cur_name, cur_series in source_df.iteritems()


                                             |          | [  0%]   00:00 -> (? left)

  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  for item in category_counts.iteritems():
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()
  stats["mad"] = series.mad()


Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


# Model Training

In [29]:
# what are we training for? benign vs malicious

[0517/213155.196468:ERROR:file_io_posix.cc(152)] open /home/drx/.config/BraveSoftware/Brave-Browser/Crash Reports/pending/83df2ce0-9359-4e34-8a6c-58e503ea954c.lock: File exists (17)


Opening in existing browser session.
