In [1]:
import os
import json

import networkx as nx
import numpy as np
import pandas as pd
from scipy.stats import entropy, kurtosis, skew

from src.dataset.dataset_info import datasets
from src.numpy_encoder import NumpyEncoder


In [2]:
# Dataset selector: exactly one `name = ...` line should be uncommented.
# `name` is used below both as the key into `datasets` and as the
# sub-folder of "datasets/" where the result files are written.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Descriptor of the selected dataset; later cells read its `path` and the
# column-name attributes (`class_col`, `label_col`, `src_ip_col`, ...).
dataset = datasets[name]

In [3]:
# Load the selected dataset from the Parquet file referenced by `dataset.path`.
df = pd.read_parquet(dataset.path)

In [4]:
class GraphsType:
    """Bundle describing one graph variant analysed by the notebook.

    Attributes are plain fields set from the constructor arguments:
    `name` (key used in the results dict), `nx_type` (graph class passed as
    `create_using` when the graph is built), `graph` (the built graph, or
    None until it is constructed) and `with_ports` (whether the variant is
    built from the port-augmented endpoint columns).
    """

    def __init__(self, name, nx_type, graph = None, with_ports = False):
        # Identity of the variant.
        self.name, self.nx_type = name, nx_type
        # Mutable state / options.
        self.graph, self.with_ports = graph, with_ports

In [5]:
# Key under which dataset-level (non-graph) metrics are stored in `results`.
data_s = "dataset"

# File name of the JSON summary written under datasets/<name>/.
new_file_name = "df_properties_new.json"

# Graph variants to analyse; uncomment entries to enable additional variants.
graphs_types = [
    GraphsType("multi_di_graph", nx.MultiDiGraph),
    # GraphsType("multi_di_graph_with_ports", nx.MultiDiGraph, with_ports = True),
    # GraphsType("di_graph", nx.DiGraph),
    # GraphsType("di_graph_with_ports", nx.DiGraph, with_ports = True),
]

# Top-level layout: dataset name, dataset-level metrics, then one empty
# section per graph variant (filled in by the cells below).
results = {"name": name, data_s: {}}
for g in graphs_types:
    results[g.name] = {}

## Classes Gini Coefficient

In [6]:
def gini_coefficient(values):
    """Compute the Gini coefficient of a set of non-negative values.

    Uses the sorted cumulative-sum formula
    ``G = (n + 1 - 2 * sum(cumsum(x)) / sum(x)) / n`` with ``x`` sorted
    in ascending order.

    Args:
        values: Array-like of non-negative numbers (e.g. counts or
            proportions). Multi-dimensional input is flattened.

    Returns:
        float: 0.0 when all values are equal; the maximum for ``n`` values is
        ``(n - 1) / n`` (a single element holds the whole total), so e.g. a
        two-class distribution can never exceed 0.5. Empty or all-zero input
        has no defined inequality and returns 0.0 instead of NaN.
    """
    # axis=None flattens before sorting, so `n` always counts every element.
    sorted_values = np.sort(np.asarray(values, dtype=float), axis=None)
    n = sorted_values.size
    total = sorted_values.sum()

    # Avoid 0/0 (NaN plus RuntimeWarning) for degenerate input.
    if n == 0 or total == 0:
        return 0.0

    cumulative_values = np.cumsum(sorted_values)
    return float((n + 1 - 2 * cumulative_values.sum() / total) / n)

# --- Multi-class imbalance -------------------------------------------------
# Gini coefficient over the share of records belonging to each class.
class_counts = df[dataset.class_col].value_counts()
class_proportions = class_counts.values / class_counts.values.sum()
multi_class_gini = gini_coefficient(class_proportions)

print("Class Counts:", class_counts, sep="\n")
print("Class Proportions:", class_proportions)
print("Multi-class Gini Coefficient:", multi_class_gini)

# One entry per class, keyed by class name, in value_counts() order.
results[data_s]["Class Counts / Proportions"] = dict(
    zip(
        class_counts.index,
        (
            {"count": int(count), "proportion": float(proportion)}
            for count, proportion in zip(class_counts.values, class_proportions)
        ),
    )
)

# --- Binary (label) imbalance ------------------------------------------------
label_counts = df[dataset.label_col].value_counts()
label_proportions = label_counts.values / label_counts.values.sum()
binary_gini = gini_coefficient(label_proportions)

print("Label Counts:", label_counts, sep="\n")
print("Label Proportions:", label_proportions)
print("Binary Classification Gini Coefficient:", binary_gini)

# --- Record counts -----------------------------------------------------------
# Label value 0 is counted as benign and 1 as attack.
total_count = len(df)
num_benign = int((df[dataset.label_col] == 0).sum())
num_attack = int((df[dataset.label_col] == 1).sum())

results[data_s].update({
    "Multi-class Gini Coefficient": multi_class_gini,
    "Binary Classification Gini Coefficient": binary_gini,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": ((num_benign * 100)/total_count),
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100)/total_count),
    "attacks": list(df[dataset.class_col].unique()),
})

# Reading the Gini values:
# - Values near 0 mean the records are spread evenly over the classes.
# - Larger values mean a more imbalanced distribution.


Class Counts:
Attack
Benign                      6682887
DDoS attacks-LOIC-HTTP       297364
DoS attacks-Hulk             108129
SSH-Bruteforce                71148
Infilteration                 59505
DoS attacks-GoldenEye         32582
DoS attacks-Slowloris         17109
Bot                           15498
DoS attacks-SlowHTTPTest      14116
FTP-BruteForce                14116
DDOS attack-LOIC-UDP           1667
DDOS attack-HOIC                230
Brute Force -Web                173
Brute Force -XSS                101
SQL Injection                    36
Name: count, dtype: int64
Class Proportions: [9.13629080e-01 4.06531485e-02 1.47825032e-02 9.72676656e-03
 8.13503182e-03 4.45434177e-03 2.33900108e-03 2.11875848e-03
 1.92982286e-03 1.92982286e-03 2.27898463e-04 3.14436992e-05
 2.36511302e-05 1.38078853e-05 4.92162248e-06]
Multi-class Gini Coefficient: 0.9041280427532231
Label Counts:
Label
0    6682887
1     631774
Name: count, dtype: int64
Label Proportions: [0.91362908 0.08637092]


In [7]:
# Show the metrics collected so far (cell output is the dict's repr).
results

{'name': 'nf_cse_cic_ids2018',
 'dataset': {'Class Counts / Proportions': {'Benign': {'count': 6682887,
    'proportion': 0.9136290800079457},
   'DDoS attacks-LOIC-HTTP': {'count': 297364,
    'proportion': 0.04065314851911797},
   'DoS attacks-Hulk': {'count': 108129, 'proportion': 0.014782503249296175},
   'SSH-Bruteforce': {'count': 71148, 'proportion': 0.00972676655828616},
   'Infilteration': {'count': 59505, 'proportion': 0.008135031821707116},
   'DoS attacks-GoldenEye': {'count': 32582,
    'proportion': 0.00445434176648788},
   'DoS attacks-Slowloris': {'count': 17109,
    'proportion': 0.002339001082893657},
   'Bot': {'count': 15498, 'proportion': 0.0021187584769820503},
   'DoS attacks-SlowHTTPTest': {'count': 14116,
    'proportion': 0.0019298228585029435},
   'FTP-BruteForce': {'count': 14116, 'proportion': 0.0019298228585029435},
   'DDOS attack-LOIC-UDP': {'count': 1667,
    'proportion': 0.00022789846310034052},
   'DDOS attack-HOIC': {'count': 230, 'proportion': 3.14

In [8]:
def class_pairs(df_cs, source_ip, destination_ip, class_column, results_dict, graph_name, folder_path):
    """Split (source, destination) pairs into single-class and mixed-class pairs.

    Rows of `df_cs` are grouped by the two endpoint columns. A pair whose rows
    all carry one class value is recorded under that class; any other pair is
    recorded together with its per-class counts and fractions. Both
    collections are written as JSON files into `folder_path`, and the two
    totals are stored in `results_dict[graph_name]`.

    Args:
        df_cs: DataFrame containing the endpoint and class columns.
        source_ip: Name of the source endpoint column.
        destination_ip: Name of the destination endpoint column.
        class_column: Name of the column holding the class of each record.
        results_dict: Dict of per-graph result dicts; the `graph_name` entry
            is created if missing and updated in place.
        graph_name: Key in `results_dict` and prefix of the output file names.
        folder_path: Output directory (created if it does not exist).

    Returns:
        None. Writes `<graph_name>_same_class_pairs.json` and
        `<graph_name>_mixed_class_pairs.json`, and sets
        `total_same_class_pairs` / `total_mixed_class_pairs` in the results.
    """
    print("====================")
    print("====================")
    print(graph_name)

    os.makedirs(folder_path, exist_ok=True)

    same_class_pairs = {}   # class label (str) -> list of pair records
    mixed_class_pairs = []  # one record per pair that carries several classes

    # One group per distinct (source, destination) combination.
    for (source, destination), group in df_cs.groupby([source_ip, destination_ip]):
        unique_classes = group[class_column].unique()
        if len(unique_classes) == 1:
            # Every record of this pair has the same class.
            class_label = str(unique_classes[0])
            same_class_pairs.setdefault(class_label, []).append({
                'node_pair': (source, destination),
                'num_instances': len(group)
            })
        else:
            # Mixed-class pair: keep absolute counts and fractions per class.
            raw_counts = group[class_column].value_counts().to_dict()
            total_instances = len(group)
            # String keys in both dicts: consistent with each other and always
            # JSON-serialisable, whatever the dtype of the class column.
            class_counts = {str(cls): int(count) for cls, count in raw_counts.items()}
            class_percentages = {str(cls): count / total_instances for cls, count in raw_counts.items()}
            mixed_class_pairs.append({
                'node_pair': (source, destination),
                'class_counts': class_counts,
                'class_percentages': class_percentages
            })

    # Persist the detailed pair lists; write each JSON document in one call
    # (writelines() on a str would iterate it character by character).
    with open(os.path.join(folder_path, f"{graph_name}_same_class_pairs.json"), "w") as f:
        f.write(json.dumps(same_class_pairs, cls=NumpyEncoder))

    with open(os.path.join(folder_path, f"{graph_name}_mixed_class_pairs.json"), "w") as f:
        f.write(json.dumps(mixed_class_pairs, cls=NumpyEncoder))

    # Totals give an overview of how class-consistent the endpoint pairs are.
    total_same_class_pairs = sum(len(pairs) for pairs in same_class_pairs.values())
    total_mixed_class_pairs = len(mixed_class_pairs)

    print("\nTotal number of same class pairs:", total_same_class_pairs)
    print("Total number of mixed class pairs:", total_mixed_class_pairs)

    # Tolerate a results dict that has no section for this graph yet.
    graph_results = results_dict.setdefault(graph_name, {})
    graph_results["total_same_class_pairs"] = total_same_class_pairs
    graph_results["total_mixed_class_pairs"] = total_mixed_class_pairs


In [9]:
# Output directory for the per-graph class-pair JSON files.
folder_path_classes = os.path.join("datasets", name, "class_pairs")

# Variants without ports are analysed on the original endpoint columns.
for g in graphs_types:
    if g.with_ports:
        continue
    class_pairs(
        df,
        dataset.src_ip_col,
        dataset.dst_ip_col,
        dataset.class_col,
        results,
        g.name,
        folder_path_classes,
    )

multi_di_graph

Total number of same class pairs: 464278
Total number of mixed class pairs: 2147


## Centrality Skewness

In [10]:
# Build the graphs whose nodes are the plain endpoint values (no ports).
# Each record becomes an edge carrying its label and numeric class.
for g in graphs_types:
    if g.with_ports:
        continue
    g.graph = nx.from_pandas_edgelist(
        df,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=[dataset.label_col, dataset.class_num_col],
        create_using=g.nx_type,
    )

In [11]:
# Rewrite the endpoint columns as "<ip>:<port>" for the *_with_ports variants.
# This mutates `df` in place, so it must run exactly once per loaded frame:
# a second run would produce "<ip>:<port>:<port>". The flag is kept in
# `df.attrs`, i.e. on the DataFrame object itself, so re-loading `df`
# resets it while simply re-running this cell is a no-op.
if any(g.with_ports for g in graphs_types) and not df.attrs.get("endpoints_include_ports", False):
    df[dataset.src_port_col] = df[dataset.src_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
    df[dataset.src_ip_col] = df[dataset.src_ip_col] + ':' + df[dataset.src_port_col]

    df[dataset.dst_port_col] = df[dataset.dst_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
    df[dataset.dst_ip_col] = df[dataset.dst_ip_col] + ':' + df[dataset.dst_port_col]

    df.attrs["endpoints_include_ports"] = True

In [12]:
# Build the port-aware variants from the current endpoint columns of `df`.
for g in graphs_types:
    if not g.with_ports:
        continue
    g.graph = nx.from_pandas_edgelist(
        df,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=[dataset.label_col, dataset.class_num_col],
        create_using=g.nx_type,
    )

In [13]:
# Class-pair analysis for the port-aware variants.
for g in graphs_types:
    if not g.with_ports:
        continue
    class_pairs(
        df,
        dataset.src_ip_col,
        dataset.dst_ip_col,
        dataset.class_col,
        results,
        g.name,
        folder_path_classes,
    )

In [14]:
# Record, for every graph variant, whether the graph is strongly connected.
for g in graphs_types:
    results[g.name]["is_strongly_connected"] = nx.is_strongly_connected(g.graph)

In [15]:
def degree_centrality_skewness(graph, results_dict, graph_name):
    """Record skewness and excess kurtosis of the degree-centrality distribution.

    Args:
        graph: Graph passed to `nx.degree_centrality`.
        results_dict: Dict of per-graph result dicts; `results_dict[graph_name]`
            receives the keys `degree_skewness` and `degree_kurtosis`.
        graph_name: Key in `results_dict`, also used as the print prefix.

    Returns:
        None. The values are printed and stored in `results_dict`.
    """
    for banner_line in ("====================", "====================", graph_name):
        print(banner_line)

    # One centrality value per node.
    centrality_values = list(nx.degree_centrality(graph).values())

    # fisher=True yields excess kurtosis.
    degree_skewness = skew(centrality_values)
    degree_kurtosis = kurtosis(centrality_values, fisher=True)

    print(graph_name, " Skewness of Degree Centrality:", degree_skewness)
    print(graph_name, " Kurtosis of Degree Centrality:", degree_kurtosis)

    # Reading the values:
    # - strongly positive skewness: a long right tail, i.e. few nodes with
    #   very high centrality;
    # - high kurtosis: heavy tails or a sharply peaked distribution.
    results_dict[graph_name].update({
        "degree_skewness": degree_skewness,
        "degree_kurtosis": degree_kurtosis,
    })


In [16]:
# Degree-centrality statistics for every graph variant.
for g in graphs_types:
    degree_centrality_skewness(g.graph, results, g.name)

# Checkpoint: persist everything collected so far.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

multi_di_graph
multi_di_graph  Skewness of Degree Centrality: 225.01525590885825
multi_di_graph  Kurtosis of Degree Centrality: 57051.08391489221


## Attackers / Victims

In [17]:
def attackers_victims(graph, results_dict, graph_name, label_col):
    """Count the distinct source and target nodes of attack edges.

    An edge counts as an attack when its `label_col` attribute equals 1; its
    source node is an attacker and its target node a victim.

    Args:
        graph: Graph exposing `edges(data=True)` and `number_of_nodes()`.
        results_dict: Dict of per-graph result dicts; `results_dict[graph_name]`
            is updated in place.
        graph_name: Key in `results_dict`, also used as the print prefix.
        label_col: Name of the edge attribute holding the label.

    Returns:
        None. Node total, attacker/victim counts, their proportions of all
        nodes (0 for a graph without nodes) and the size of the
        attacker/victim overlap are printed and/or stored in `results_dict`.
    """
    print("====================")
    print("====================")
    print(graph_name)

    # Endpoints of every edge labelled as an attack.
    attack_endpoints = [
        (src, dst)
        for src, dst, attrs in graph.edges(data=True)
        if attrs[label_col] == 1
    ]
    attackers = {src for src, _ in attack_endpoints}
    victims = {dst for _, dst in attack_endpoints}

    num_attackers, num_victims = len(attackers), len(victims)

    # Shares relative to all nodes of the graph.
    total_nodes = graph.number_of_nodes()
    if total_nodes > 0:
        attacker_proportion = num_attackers / total_nodes
        victim_proportion = num_victims / total_nodes
    else:
        attacker_proportion = victim_proportion = 0

    print(graph_name, " Number of Attackers:", num_attackers)
    print(graph_name, " Number of Victims:", num_victims)
    print(graph_name, " Proportion of Attackers:", attacker_proportion)
    print(graph_name, " Proportion of Victims:", victim_proportion)

    results_dict[graph_name].update({
        "total_nodes": total_nodes,
        "Number of Attackers": num_attackers,
        "Number of Victims": num_victims,
        "Proportion of Attackers": attacker_proportion,
        "Proportion of Victims": victim_proportion,
        # Nodes that appear both as a source and as a target of attack edges.
        "intersection between attacks and victims": len(attackers & victims),
    })


In [18]:
# Attacker/victim statistics are only computed for MultiDiGraph variants.
for g in graphs_types:
    if g.nx_type != nx.MultiDiGraph:
        continue
    attackers_victims(g.graph, results, g.name, dataset.label_col)

# Checkpoint: persist everything collected so far.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

multi_di_graph
multi_di_graph  Number of Attackers: 724
multi_di_graph  Number of Victims: 1488
multi_di_graph  Proportion of Attackers: 0.00786161814687327
multi_di_graph  Proportion of Victims: 0.016157579837772686


## Graph Metrics Analysis

### Clustering Coefficients

In [19]:
def cal_clustering_coefficients(graph, results_dict, graph_name):
    """Record mean and standard deviation of the per-node clustering coefficients.

    The graph is first converted with `nx.Graph(graph)` and the clustering
    coefficients are computed on that converted graph.

    Args:
        graph: Graph to analyse.
        results_dict: Dict of per-graph result dicts; `results_dict[graph_name]`
            receives both statistics.
        graph_name: Key in `results_dict`, also used as the print prefix.

    Returns:
        None. The values are printed and stored in `results_dict`.
    """
    for banner_line in ("====================", "====================", graph_name):
        print(banner_line)

    # Clustering is computed on the nx.Graph conversion of the input graph.
    simple_graph = nx.Graph(graph)
    per_node_clustering = list(nx.clustering(simple_graph).values())

    mean_clustering = np.mean(per_node_clustering)
    std_clustering = np.std(per_node_clustering)

    print(graph_name, " Mean Clustering Coefficient:", mean_clustering)
    print(graph_name, " Standard Deviation of Clustering Coefficients:", std_clustering)

    results_dict[graph_name].update({
        "Mean Clustering Coefficient": mean_clustering,
        "Standard Deviation of Clustering Coefficients": std_clustering,
    })

In [20]:
# Clustering statistics for every graph variant.
for g in graphs_types:
    cal_clustering_coefficients(g.graph, results, g.name)

# Checkpoint: persist everything collected so far.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

multi_di_graph
multi_di_graph  Mean Clustering Coefficient: 9.574336153796037e-05
multi_di_graph  Standard Deviation of Clustering Coefficients: 0.006364805157974324


### Degree Assortativity

In [21]:
def cal_degree_assortativity(graph, results_dict, graph_name):
    """Record the degree assortativity coefficient of `graph`.

    Args:
        graph: Graph passed to `nx.degree_assortativity_coefficient`.
        results_dict: Dict of per-graph result dicts; the key
            "Graph Degree Assortativity Coefficient" of
            `results_dict[graph_name]` is set to the coefficient, or to
            "not applicable" when networkx raises `NetworkXError`.
        graph_name: Key in `results_dict`, also used as the print prefix.

    Returns:
        None.
    """
    result_key = "Graph Degree Assortativity Coefficient"

    print("====================")
    print("====================")
    print(graph_name)

    try:
        degree_assortativity = nx.degree_assortativity_coefficient(graph)
    except nx.NetworkXError as e:
        # The coefficient could not be computed for this graph.
        results_dict[graph_name][result_key] = "not applicable"
        print(graph_name, " Error calculating assortativity:", e)
    else:
        results_dict[graph_name][result_key] = degree_assortativity
        print(graph_name, " Degree Assortativity Coefficient:", degree_assortativity)
    

In [22]:
# Degree assortativity for every graph variant.
for g in graphs_types:
    cal_degree_assortativity(g.graph, results, g.name)

# Checkpoint: persist everything collected so far.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

multi_di_graph
multi_di_graph  Degree Assortativity Coefficient: -0.13294248169616604


### Graph Diameter

In [23]:
def cal_diameter(graph, results_dict, graph_name):
    """Record the diameter of `graph`, or "not applicable" when it is undefined.

    The diameter is only computed when the graph is strongly connected.

    Args:
        graph: Directed graph to analyse.
        results_dict: Dict of per-graph result dicts;
            `results_dict[graph_name]["diameter"]` is always set, either to
            the diameter or to the string "not applicable".
        graph_name: Key in `results_dict`, also used as the print prefix.

    Returns:
        None.
    """
    print("====================")
    print("====================")
    print(graph_name)
    # Graph Diameter Metric
    try:
        if nx.is_strongly_connected(graph):
            diameter = nx.diameter(graph)
            results_dict[graph_name]["diameter"] = diameter
            print(graph_name, " Graph Diameter multidigraph:", diameter)
        else:
            results_dict[graph_name]["diameter"] = "not applicable"
            print(graph_name, " Graph is not strongly connected, diameter is undefined.")

    # NetworkXPointlessConcept is what the connectivity check raises for a
    # graph without nodes; it is not a subclass of NetworkXError.
    except (nx.NetworkXError, nx.NetworkXPointlessConcept) as e:
        # Record the failure like the other metric functions do, so the key
        # is present in the saved results even when the computation fails.
        results_dict[graph_name]["diameter"] = "not applicable"
        print("Error calculating diameter:", e)

In [24]:
# Diameter for every graph variant.
for g in graphs_types:
    cal_diameter(g.graph, results, g.name)

# Checkpoint: persist everything collected so far.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

multi_di_graph
multi_di_graph  Graph is not strongly connected, diameter is undefined.


### Path Length Distribution

In [25]:
def path_length_distribution(graph, results_dict, graph_name):
    """Record mean and standard deviation of all shortest-path lengths.

    The lengths reported by `nx.all_pairs_shortest_path_length` are
    aggregated into a histogram (length -> number of pairs) while iterating,
    so memory stays proportional to the number of distinct lengths instead of
    the number of (source, target) pairs.

    Args:
        graph: Graph to analyse.
        results_dict: Dict of per-graph result dicts; the keys
            "Mean Path Length" and "Standard Deviation of Path Lengths" of
            `results_dict[graph_name]` are set to the statistics, or to
            "not applicable" when networkx raises `NetworkXError` or no
            lengths are reported at all (graph without nodes).
        graph_name: Key in `results_dict`, also used as the print prefix.

    Returns:
        None.

    NOTE(review): every length reported by networkx is counted, which
    presumably includes the zero-length entry of each source to itself -
    confirm whether those entries should be part of the statistic.
    """
    from collections import Counter  # local import keeps this cell self-contained

    print("====================")
    print("====================")
    print(graph_name)

    # Path Length Distribution Metric
    try:
        length_histogram = Counter()
        # The generator yields one (source, {target: length}) pair at a time;
        # only the histogram is retained between iterations.
        for _source, lengths_from_source in nx.all_pairs_shortest_path_length(graph):
            length_histogram.update(lengths_from_source.values())
    except nx.NetworkXError as e:
        results_dict[graph_name]["Mean Path Length"] = "not applicable"
        results_dict[graph_name]["Standard Deviation of Path Lengths"] = "not applicable"
        print(graph_name, " Error calculating path length distribution:", e)
        return

    if not length_histogram:
        # No lengths at all: mean and deviation are undefined.
        results_dict[graph_name]["Mean Path Length"] = "not applicable"
        results_dict[graph_name]["Standard Deviation of Path Lengths"] = "not applicable"
        print(graph_name, " No path lengths found, path length distribution is undefined.")
        return

    n_bins = len(length_histogram)
    distances = np.fromiter(length_histogram.keys(), dtype=float, count=n_bins)
    pair_counts = np.fromiter(length_histogram.values(), dtype=float, count=n_bins)

    # Weighted mean / population standard deviation (same definition as
    # np.mean / np.std over the expanded list of lengths).
    mean_path_length = np.average(distances, weights=pair_counts)
    std_path_length = np.sqrt(np.average((distances - mean_path_length) ** 2, weights=pair_counts))

    print(graph_name, " Mean Path Length MultiDiGraph:", mean_path_length)
    print(graph_name, " Standard Deviation of Path Lengths MultiDiGraph:", std_path_length)
    results_dict[graph_name]["Mean Path Length"] = mean_path_length
    results_dict[graph_name]["Standard Deviation of Path Lengths"] = std_path_length

In [26]:
# Path-length statistics are only computed for the variants without ports.
for g in graphs_types:
    if g.with_ports:
        continue
    path_length_distribution(g.graph, results, g.name)

multi_di_graph


## Saving File

In [None]:
# Final write of the complete results dict to datasets/<name>/<new_file_name>.
with open(os.path.join("datasets", name, new_file_name), mode="w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))