# Visualization for GDLC

This notebook visualizes how benign and attack records are distributed with respect to the complex-network measures that were added to the dataset as features.

## Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

from src.data.dataset_info import datasets

# Select the dataset to visualise: the first entry of the project's `datasets`
# collection. Change the index to look at a different one.
dataset = datasets[0]
name = dataset.name
print("dataset: {}".format(name))
# The preprocessed records are read from a pickle file named after the dataset.
path = "./datasets/preprocessed/{}.pkl".format(name)
# graph_path = "./datasets/preprocessed/graph_{}.gexf".format(name)

# Load the preprocessed dataset as a pandas DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced locally by the preprocessing step.
df = pd.read_pickle(path)

In [None]:
# Preview the first rows of the loaded dataframe as a quick sanity check.
df.head()

In [None]:
# Sort the records chronologically, in place, by the dataset's timestamp column.
df.sort_values(dataset.timestamp_col, inplace=True)

# Materialise each record's chronological position as an "index" column.
# The first reset discards the old index labels (now out of order after the
# sort) and installs a fresh 0..N-1 index. The second reset keeps the default
# drop=False, so that fresh index is moved into a regular column, which pandas
# names "index" because the index itself is unnamed.
df.reset_index(inplace=True, drop = True)
df.reset_index(inplace=True)

In [None]:

def ecdf(data):
    """
    Compute the Empirical Cumulative Distribution Function (ECDF) of a sample.

    For each observation, the ECDF gives the fraction of observations that are
    less than or equal to it, which makes it a convenient way to inspect how
    the values of a sample are distributed.

    Parameters:
    - data: array-like, the numerical observations.

    Returns:
    - x: array, the observations sorted in ascending order.
    - y: array, the cumulative fraction associated with each entry of x,
      running from 1/N up to 1 for N observations.
    """
    n_points = len(data)

    # Ranks 1..N of the sorted observations, normalised by N to get fractions.
    ranks = np.arange(1, n_points + 1)
    cumulative_fraction = ranks / n_points

    # Observations in ascending order, paired position-wise with the fractions.
    ordered_values = np.sort(data)

    return ordered_values, cumulative_fraction

In [None]:

def _plot_ecdf_by_label(column):
    """
    Plot the ECDF of one dataframe column, split by record label.

    Draws two curves on a fresh 14x10 figure: benign records (label 0) in blue
    and attack records (label 1) in red. The figure is shown immediately.

    Parameters:
    - column: str, name of the column of `df` to plot; also used as the
      x-axis label.
    """
    labels = df[dataset.label_col]
    benign_values = df.loc[labels == 0, column]  # Benign records
    attack_values = df.loc[labels == 1, column]  # Attack records

    # Compute ECDF
    x_benign, y_benign = ecdf(benign_values)
    x_attack, y_attack = ecdf(attack_values)

    # One new figure per call. No plt.clf() beforehand: after plt.show() there
    # is no current figure, so clf() would implicitly create an extra, empty
    # default-size figure on every call.
    plt.figure(figsize=(14, 10))

    # Plot the ECDF
    plt.plot(x_benign, y_benign, color="#3062d9", linewidth=4, marker='o', markersize=4)
    plt.plot(x_attack, y_attack, color="#eb4034", linewidth=4, marker='o', markersize=4)
    plt.xlabel(column)
    plt.ylabel('ECDF')
    # plt.title('Empirical Cumulative Distribution Function (ECDF)')

    # Show the plot
    plt.show()


# Global font size for every figure below; set once rather than per plot.
plt.rcParams['font.size'] = 18

# Distribution of benign vs. attack records over chronological position.
_plot_ecdf_by_label("index")

# Distribution of benign vs. attack records over each network feature.
for feature in dataset.network_features:
    print(feature)
    _plot_ecdf_by_label(feature)

