In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import ttest_ind, zscore
from summarytools import dfSummary

# Load Engineered data

In [2]:
# Load the engineered feature tables for the malicious and the benign traffic.
# NOTE(review): unpickling can execute arbitrary code, so these .pkl files must
# come from a trusted source (e.g. produced by this project's own pipeline).
malicious_features = pd.read_pickle("data/malicious_features_numeric.pkl")
benign_features = pd.read_pickle("data/benign_features_numeric.pkl")

In [None]:
# Preview the malicious feature table (displayed as the cell's last expression).
malicious_features

In [None]:
# Preview the benign feature table (displayed as the cell's last expression).
benign_features

In [None]:
# Remove the "Payload" column from both tables so it is excluded from the
# statistics below. drop(..., errors="ignore") is used instead of pop() because
# it is idempotent: re-running this cell no longer raises KeyError once the
# column is gone, and the cell no longer echoes the popped Payload Series.
malicious_features = malicious_features.drop(columns=["Payload"], errors="ignore")
benign_features = benign_features.drop(columns=["Payload"], errors="ignore")

# EDA

Exploratory Data Analysis approaches the dataset as a black box that we need to visualize and analyze statistically with the following goals:
- get insights about our data
- test hypotheses
- decide on models and further processing, such as feature engineering.

EDA can be performed for both benign and malicious data. Most steps below are run on both datasets; the outlier plots are shown only for the malicious data, but the same functions can be applied to the benign data.

## Descriptive statistics & data

- Describe columns and data types
- Descriptive statistics
  -  count, 
  -  mean, 
  -  standard deviation, 
  -  minimum, 
  -  25th percentile, 
  -  median (50th percentile), 
  -  75th percentile, and 
  -  maximum

In [None]:
# Descriptive statistics of the malicious feature columns.
malicious_features.describe()

In [None]:
# Descriptive statistics of the benign feature columns.
benign_features.describe()

In [None]:
# Pairwise correlation matrix of the malicious features; kept in a variable so
# it can be passed to correlation_heatmap() later, and displayed as cell output.
mirai_correlation_matrix = malicious_features.corr()
mirai_correlation_matrix

In [None]:
# Pairwise correlation matrix of the benign features; kept in a variable so it
# can be passed to correlation_heatmap() later, and displayed as cell output.
benign_correlation_matrix = benign_features.corr()
benign_correlation_matrix

In [10]:
# Create a heatmap
def correlation_heatmap(correlation_matrix, title="Correlation Heatmap", figsize=(8, 6)):
    """
    Draw an annotated heatmap of a correlation matrix and show it.

    The tick labels are taken from the matrix itself (its column names on the
    x axis and its index on the y axis) rather than from a hard-coded list of
    feature names, so the axes cannot be mislabelled when columns are added,
    removed, or reordered.

    Parameters:
    correlation_matrix (pandas.DataFrame): Matrix to plot, e.g. the result of DataFrame.corr()
    title (str): Title placed above the heatmap
    figsize (tuple): Figure size in inches as (width, height)
    """
    plt.figure(figsize=figsize)
    # NOTE: sns.set() changes seaborn's global theme, so the font scale also
    # applies to plots drawn after this call.
    sns.set(font_scale=1.2)

    sns.heatmap(
        correlation_matrix,
        annot=True,  # write the coefficient inside every cell
        cmap="coolwarm",
        square=True,
        xticklabels=True,  # True -> label every column with its own name
        yticklabels=True,  # True -> label every row with its own index value
    )

    plt.title(title)
    plt.show()

In [None]:
# Heatmap of the correlations between the malicious features.
correlation_heatmap(mirai_correlation_matrix)

In [None]:
# Heatmap of the correlations between the benign features.
correlation_heatmap(benign_correlation_matrix)

## Hypothesis testing

- Is the difference between two groups or variables statistically significant?
- Use t-test to compare means of two groups
  - assumes that data follows normal distribution
- Types of variables
  - dependent: the effect of a phenomenon. For example, whether a network is compromised is the effect we want to explain from the number of HTTP requests.
  - independent: the cause. The number of HTTP requests affects whether a network is compromised.

In [13]:
def hypothesis_testing(df, col1, col2, alpha=0.05, equal_var=True):
    """
    Compare the means of two columns with an independent two-sample t-test.

    Missing values are dropped from each column before testing. When the
    p-value is undefined (fewer than two observations in a column, or no
    variance at all) a dedicated message is returned; previously a NaN p-value
    fell into the ``else`` branch and was reported as "not statistically
    significant".

    Parameters:
    df (pandas.DataFrame): Input dataframe
    col1 (str): Name of the first column
    col2 (str): Name of the second column
    alpha (float): Significance level the p-value is compared against
    equal_var (bool): Passed to scipy's ttest_ind; False selects Welch's t-test

    Returns:
    str: A sentence stating whether the difference is significant at ``alpha``
    """
    group1 = df[col1].dropna()
    group2 = df[col2].dropna()

    # A t-test needs at least two observations per group to estimate a variance.
    if len(group1) < 2 or len(group2) < 2:
        pvalue = float("nan")
    else:
        pvalue = ttest_ind(group1, group2, equal_var=equal_var).pvalue

    # NaN compares False against everything, so it must be handled explicitly.
    if np.isnan(pvalue):
        return "The difference between {} and {} could not be tested (p-value is undefined)".format(
            col1, col2
        )
    if pvalue < alpha:
        return "The difference between {} and {} is statistically significant (p < {})".format(
            col1, col2, alpha
        )
    return "The difference between {} and {} is not statistically significant (p >= {})".format(
        col1, col2, alpha
    )

In [None]:
# t-test on the means of "dst_port_freq_encoded" and "Packet Length" in the malicious data.
hypothesis_testing(malicious_features, "dst_port_freq_encoded", "Packet Length")

In [None]:
# t-test on the means of "dst_port_freq_encoded" and "Packet Length" in the benign data.
hypothesis_testing(benign_features, "dst_port_freq_encoded", "Packet Length")

## Outliers

- observation that significantly differs from others in a dataset
- Causes
  - measurement errors
  - extreme rare values
- significant impact in statistical analysis
- measurements
  - z-score: `(x - mean) / std_dev`
  - IQR method: identifies outliers as observations that fall below `Q1 - 1.5 * IQR` or above `Q3 + 1.5 * IQR`, where Q1 and Q3 are the first and third quartiles and IQR is the interquartile range (`Q3 - Q1`).
  - visual inspection

In [16]:
def detect_outliers_zscore(df, column, threshold=3):
    """
    Return the rows of ``df`` whose value in ``column`` is a z-score outlier.

    A row is an outlier when the absolute z-score of its value exceeds
    ``threshold``. Missing values are ignored when the mean and standard
    deviation are computed; previously a single NaN turned every z-score into
    NaN, so no outlier was ever reported. Rows whose value is missing are
    never flagged.

    Parameters:
    df (pandas.DataFrame): Input dataframe
    column (str): Column name to analyze
    threshold (float): Z-score threshold for outlier detection

    Returns:
    pandas.DataFrame: The subset of rows classified as outliers
    """
    values = df[column].astype(float).to_numpy()
    # nan_policy="omit" keeps NaN inputs as NaN outputs without letting them
    # affect the z-scores of the other values.
    zscores = np.asarray(zscore(values, nan_policy="omit"), dtype=float)
    # NaN > threshold is False, so missing values drop out of the mask.
    is_outlier = np.abs(zscores) > threshold
    return df[is_outlier]

In [None]:
# Rows of the malicious data whose "Packet Length" z-score exceeds 3 in absolute value.
# The bare name on the last line gives the rich DataFrame display instead of
# the plain-text dump produced by print().
outliers = detect_outliers_zscore(malicious_features, "Packet Length", threshold=3)
outliers

In [None]:
# Rows of the benign data whose "Packet Length" z-score exceeds 3 in absolute value.
# The bare name on the last line gives the rich DataFrame display instead of
# the plain-text dump produced by print().
outliers = detect_outliers_zscore(benign_features, "Packet Length", threshold=3)
outliers

In [19]:
def plot_outliers_multiple_views(df, column, threshold=3):
    """
    Creates a comprehensive view of outliers using multiple plots:
    1. Box plot
    2. Scatter plot with z-scores
    3. Distribution plot with outlier regions marked

    Parameters:
    df (pandas.DataFrame): Input dataframe
    column (str): Column name to analyze
    threshold (float): Z-score threshold for outlier detection

    Returns:
    matplotlib.figure.Figure: The figure holding the three stacked plots
    """
    values = df[column]

    # Classify outliers. nan_policy="omit" stops a single missing value from
    # turning every z-score into NaN (which would hide all outliers).
    z_scores = np.asarray(
        zscore(values.astype(float).to_numpy(), nan_policy="omit"), dtype=float
    )
    is_outlier = np.abs(z_scores) > threshold

    # zscore() defaults to the population standard deviation (ddof=0) while
    # pandas' std() defaults to ddof=1. Use ddof=0 here so the threshold lines
    # drawn below sit exactly on the boundary used for the classification.
    mean = values.mean()
    std = values.std(ddof=0)

    # Create a figure with three subplots
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 15))

    # 1. Box Plot
    sns.boxplot(x=values, ax=ax1)
    ax1.set_title('Box Plot with Outliers')

    # 2. Scatter Plot with z-scores (colours chosen vectorised, not row by row)
    scatter_colors = np.where(is_outlier, 'red', 'blue').tolist()
    ax2.scatter(np.arange(len(values)), values, c=scatter_colors, alpha=0.5)
    ax2.axhline(y=mean, color='green', linestyle='--', label='Mean')
    ax2.set_title('Scatter Plot (Red Points are Outliers)')
    ax2.set_xlabel('Index')
    ax2.set_ylabel('Value')
    ax2.legend()

    # 3. Distribution Plot with Outlier Regions
    sns.kdeplot(data=values, ax=ax3)
    outliers = values[is_outlier]
    if len(outliers) > 0:
        # The histogram is normalised over the outliers only, so its height is
        # not on the same scale as the density curve; it only marks where they lie.
        ax3.hist(outliers, bins=10, alpha=0.5, color='red', density=True)
    ax3.set_title('Distribution with Outliers Highlighted')

    # Add threshold lines to distribution plot
    ax3.axvline(mean + threshold * std, color='r', linestyle='--', alpha=0.5)
    ax3.axvline(mean - threshold * std, color='r', linestyle='--', alpha=0.5)

    fig.tight_layout()
    return fig

def plot_outlier_details(df, column, threshold=3):
    """
    Plot the z-scores of one column in two stacked panels.

    The upper panel is a histogram of the z-scores with the +/- threshold
    marked; the lower panel plots every original value against its z-score,
    with outliers (|z-score| > threshold) drawn in red.

    Parameters:
    df (pandas.DataFrame): Input dataframe
    column (str): Column name to analyze
    threshold (float): Z-score threshold for outlier detection

    Returns:
    matplotlib.figure.Figure: The figure holding both panels
    """
    # Work on a copy of the frame extended with the z-score and outlier flag
    scores = zscore(df[column])
    annotated = df.assign(zscore=scores, is_outlier=np.abs(scores) > threshold)

    fig, (hist_ax, scatter_ax) = plt.subplots(2, 1, figsize=(12, 10))

    # Panel 1: distribution of the z-scores with both thresholds marked
    sns.histplot(data=annotated, x='zscore', ax=hist_ax)
    for sign, cutoff in (('+', threshold), ('-', -threshold)):
        hist_ax.axvline(x=cutoff, color='r', linestyle='--',
                        label=f'Threshold ({sign}{threshold})')
    hist_ax.set_title('Z-score Distribution')
    hist_ax.legend()

    # Panel 2: original value against z-score, normal points first, then outliers
    flagged = annotated['is_outlier']
    layers = (
        (annotated[~flagged], 'blue', 'Normal'),
        (annotated[flagged], 'red', 'Outliers'),
    )
    for subset, colour, name in layers:
        scatter_ax.scatter(subset['zscore'], subset[column],
                           c=colour, label=name, alpha=0.5)
    for cutoff in (threshold, -threshold):
        scatter_ax.axvline(x=cutoff, color='r', linestyle='--', alpha=0.3)
    scatter_ax.set_xlabel('Z-score')
    scatter_ax.set_ylabel('Original Value')
    scatter_ax.set_title('Z-score vs Original Value')
    scatter_ax.legend()

    plt.tight_layout()
    return fig


In [None]:
# Create both outlier figures for the "Packet Length" column of the malicious data
fig1 = plot_outliers_multiple_views(malicious_features, "Packet Length", threshold=3)
fig2 = plot_outlier_details(malicious_features, "Packet Length", threshold=3)

# Display the plots
plt.show()

# Summaries & Visualizations
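Using the `dfSummary` function from the `summarytools` package, we create a wealth of summaries and visualizations with minimal code. These offer valuable insights for data exploration. The `sweetviz` report cells below are kept commented out because of a known upstream error.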

In [None]:
# Summary report of the malicious features produced by summarytools' dfSummary.
dfSummary(malicious_features)

In [None]:
# Summary report of the benign features produced by summarytools' dfSummary.
dfSummary(benign_features)

In [None]:
# import sweetviz as sv
# known error, branch unmerged https://github.com/fbdesignpro/sweetviz/pull/178
# my_report = sv.analyze(malicious_features)
# # this shows and saves the html to the specific path. Make sure the directory html exists.
# my_report.show_html(filepath="html/malicious.html")

In [None]:
# my_report = sv.analyze(benign_features)
# # this shows and saves the html to the specific path. Make sure the directory html exists.
# my_report.show_html(filepath="html/benign.html")