In [None]:
import os
import json

import networkx as nx
import numpy as np
import pandas as pd
from scipy.stats import entropy, kurtosis, skew

from src.dataset.dataset_info import datasets
from src.numpy_encoder import NumpyEncoder


In [None]:
# Dataset selection: exactly one `name` assignment is active; the commented-out
# lines list alternative values that can be swapped in instead.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Look up the entry registered under `name`; later cells read its `path` and
# its column-name attributes (label, class, IP and port columns).
dataset = datasets[name]

In [None]:
# Load the selected dataset from the parquet file its descriptor points to.
df = pd.read_parquet(dataset.path)

In [None]:
# Keys of the sections inside `results`: one for dataset-level statistics and
# one per graph variant (multi/simple directed graph, with and without ports).
data_s = "dataset"
multi_g = "multi_di_graph"
multi_g_with_ports = "multi_di_graph_with_ports"
di_g = "di_graph"
di_g_with_ports = "di_graph_with_ports"

# Every section starts out empty and is filled in by the cells below.
results = {
    "name": name,
    data_s: {},
    multi_g: {},
    multi_g_with_ports: {},
    di_g: {},
    di_g_with_ports: {},
}

## Class Imbalance (Gini Coefficient)

In [None]:
def gini_coefficient(values):
    """Return the Gini coefficient of a collection of non-negative values.

    With the values sorted in ascending order the coefficient is computed as
    ``(n + 1 - 2 * sum(cumsum(values)) / sum(values)) / n``.

    Args:
        values: Array-like of non-negative numbers (counts or proportions).
            Input of any shape is flattened before the computation.

    Returns:
        The Gini coefficient as a float. It is 0 for a perfectly even
        distribution and ``(n - 1) / n`` when a single entry holds everything.
        Empty or all-zero input returns 0.0 instead of NaN, because there is
        no inequality to measure.
    """
    sorted_values = np.sort(np.asarray(values, dtype=float).ravel())
    n = sorted_values.size
    total = sorted_values.sum()
    # Guard the two divisions below: 0 / 0 would otherwise yield NaN.
    if n == 0 or total == 0:
        return 0.0
    cumulative_values = np.cumsum(sorted_values)
    return (n + 1 - 2 * cumulative_values.sum() / total) / n

# Multi-class imbalance: Gini coefficient over the share of records per class.
class_counts = df[dataset.class_num_col].value_counts()
class_proportions = class_counts.values / class_counts.values.sum()
multi_class_gini = gini_coefficient(class_proportions)

print("Class Counts:", class_counts, sep="\n")
print("Class Proportions:", class_proportions)
print("Multi-class Gini Coefficient:", multi_class_gini)

# Binary imbalance: the same measure over the share of records per label.
label_counts = df[dataset.label_col].value_counts()
label_proportions = label_counts.values / label_counts.values.sum()
binary_gini = gini_coefficient(label_proportions)

print("Label Counts:", label_counts, sep="\n")
print("Label Proportions:", label_proportions)
print("Binary Classification Gini Coefficient:", binary_gini)

# Record counts: rows labelled 0 are counted as benign, rows labelled 1 as attack.
total_count = len(df)
num_benign = int((df[dataset.label_col] == 0).sum())
num_attack = int((df[dataset.label_col] == 1).sum())

results[data_s].update({
    "Multi-class Gini Coefficient": multi_class_gini,
    "Binary Classification Gini Coefficient": binary_gini,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": ((num_benign * 100) / total_count),
    "num_attack": num_attack,
    "percentage_of_attack_records": ((num_attack * 100) / total_count),
    "attacks": list(df[dataset.class_col].unique()),
})

# Interpretation:
# - A Gini coefficient closer to 0 indicates balanced distribution.
# - A Gini coefficient closer to 1 indicates imbalanced distribution.


In [None]:
def class_pairs(df_cs, source_ip, destination_ip, class_column, results_dict, graph_name, folder_path):
    """Split (source, destination) node pairs into single-class and mixed-class pairs.

    The rows of ``df_cs`` are grouped by the source and destination columns. A
    pair whose rows all share one value of ``class_column`` is filed under that
    class; any other pair is reported with its per-class counts and fractions.
    Both collections are written as JSON into ``folder_path`` and the totals are
    stored in ``results_dict[graph_name]``.

    Args:
        df_cs: DataFrame holding one record per row.
        source_ip: Name of the column identifying the source node.
        destination_ip: Name of the column identifying the destination node.
        class_column: Name of the column holding the class (or label) of a row.
        results_dict: Dict of result sections; the section for ``graph_name``
            is created when it does not exist yet.
        graph_name: Section key, also used as prefix of the output file names.
        folder_path: Directory for the JSON files; created when missing.

    Returns:
        None. Results are written to disk and into ``results_dict``.
    """
    print("====================")
    print("====================")
    print(graph_name)

    os.makedirs(folder_path, exist_ok=True)

    # class label -> list of pairs whose records all carry that label
    same_class_pairs = {}
    # pairs whose records carry more than one label
    mixed_class_pairs = []

    # Group by source and destination node
    for (source, destination), group in df_cs.groupby([source_ip, destination_ip]):
        unique_classes = group[class_column].unique()
        if len(unique_classes) == 1:
            # All records have the same class
            class_label = str(unique_classes[0])
            same_class_pairs.setdefault(class_label, []).append({
                'node_pair': (source, destination),
                'num_instances': len(group)
            })
        else:
            # Mixed class scenario
            class_counts = group[class_column].value_counts().to_dict()
            total_instances = len(group)
            class_percentages = {str(cls): count / total_instances for cls, count in class_counts.items()}
            mixed_class_pairs.append({
                'node_pair': (source, destination),
                'class_counts': class_counts,
                'class_percentages': class_percentages
            })

    # `write` emits the serialized document in one call; `writelines` would
    # iterate over the string and write it one character at a time.
    with open(os.path.join(folder_path, f"{graph_name}_same_class_pairs.json"), "w") as f:
        f.write(json.dumps(same_class_pairs, cls=NumpyEncoder))

    with open(os.path.join(folder_path, f"{graph_name}_mixed_class_pairs.json"), "w") as f:
        f.write(json.dumps(mixed_class_pairs, cls=NumpyEncoder))

    # Total counts
    total_same_class_pairs = sum(len(pairs) for pairs in same_class_pairs.values())
    total_mixed_class_pairs = len(mixed_class_pairs)

    print("\nTotal number of same class pairs:", total_same_class_pairs)
    print("Total number of mixed class pairs:", total_mixed_class_pairs)

    # Create the section on demand so a missing key does not discard the work above.
    section = results_dict.setdefault(graph_name, {})
    section["total_same_class_pairs"] = total_same_class_pairs
    section["total_mixed_class_pairs"] = total_mixed_class_pairs

    # Interpretation:
    # - `same_class_pairs` contains node pairs with consistent classes across all records, including the number of instances.
    # - `mixed_class_pairs` contains node pairs with mixed classes, the counts and percentages for each class.
    # - Total counts provide an overview of the dataset's class consistency.


In [None]:
# Folder that receives the per-pair JSON reports of the selected dataset.
folder_path_classes = os.path.join("datasets", name, "class_pairs")

# IP-level node pairs: first grouped by attack class, then by binary label.
class_pairs(
    df_cs=df,
    source_ip=dataset.src_ip_col,
    destination_ip=dataset.dst_ip_col,
    class_column=dataset.class_col,
    results_dict=results,
    graph_name=multi_g,
    folder_path=folder_path_classes,
)
class_pairs(
    df_cs=df,
    source_ip=dataset.src_ip_col,
    destination_ip=dataset.dst_ip_col,
    class_column=dataset.label_col,
    results_dict=results,
    graph_name=di_g,
    folder_path=folder_path_classes,
)

## Centrality Skewness

In [None]:
# One edge per row between its source and destination IP; the multigraph keeps
# parallel edges, each carrying the row's label and numeric class.
G = nx.from_pandas_edgelist(
    df,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    edge_attr=[dataset.label_col, dataset.class_num_col],
    create_using=nx.MultiDiGraph(),
)
# Simple directed view of the same graph (at most one edge per ordered node pair).
simple_digraph = nx.DiGraph(G)

In [None]:
# Re-key the nodes from "ip" to "ip:port" and rebuild both graph variants on
# the port-qualified endpoints.
# NOTE(review): the IP columns of `df` are overwritten in place, so this cell is
# not idempotent - running it a second time appends the port again
# ("ip:port:port") - and every later use of `df` sees the port-qualified values.
# NOTE(review): `.astype(int)` presumably fails on missing ports - confirm the
# port columns never contain NaN.
df[dataset.src_port_col] = df[dataset.src_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
df[dataset.src_ip_col] = df[dataset.src_ip_col] + ':' + df[dataset.src_port_col]

df[dataset.dst_port_col] = df[dataset.dst_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
df[dataset.dst_ip_col] = df[dataset.dst_ip_col] + ':' + df[dataset.dst_port_col]

# Same construction as the IP-level graphs, now with "ip:port" strings as nodes.
G_with_ports = nx.from_pandas_edgelist(df, dataset.src_ip_col, dataset.dst_ip_col, edge_attr=[dataset.label_col, dataset.class_num_col], create_using=nx.MultiDiGraph())
simple_digraph_with_ports = nx.DiGraph(G_with_ports)

In [None]:
# Same analysis as for the IP-level pairs; by now the IP columns of `df` hold
# "ip:port" values, so the pairs are port-qualified endpoints.
class_pairs(
    df_cs=df,
    source_ip=dataset.src_ip_col,
    destination_ip=dataset.dst_ip_col,
    class_column=dataset.class_col,
    results_dict=results,
    graph_name=multi_g_with_ports,
    folder_path=folder_path_classes,
)
class_pairs(
    df_cs=df,
    source_ip=dataset.src_ip_col,
    destination_ip=dataset.dst_ip_col,
    class_column=dataset.label_col,
    results_dict=results,
    graph_name=di_g_with_ports,
    folder_path=folder_path_classes,
)

In [None]:
# Record for each of the four graph variants whether it is strongly connected.
results[multi_g].update({"is_strongly_connected": nx.is_strongly_connected(G)})
results[multi_g_with_ports].update({"is_strongly_connected": nx.is_strongly_connected(G_with_ports)})
results[di_g].update({"is_strongly_connected": nx.is_strongly_connected(simple_digraph)})
results[di_g_with_ports].update({"is_strongly_connected": nx.is_strongly_connected(simple_digraph_with_ports)})

In [None]:
def degree_centrality_skewness(graph, results_dict, graph_name):
    """Store skewness and excess kurtosis of the graph's degree centrality.

    Args:
        graph: Graph accepted by ``nx.degree_centrality``.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist and receives "degree_skewness" and
            "degree_kurtosis".
        graph_name: Section key, also used as prefix of the printed lines.

    Returns:
        None.
    """
    print("====================", "====================", graph_name, sep="\n")

    # One centrality value per node.
    centrality_values = list(nx.degree_centrality(graph).values())

    degree_skewness = skew(centrality_values)
    # fisher=True makes scipy report excess kurtosis.
    degree_kurtosis = kurtosis(centrality_values, fisher=True)

    print(graph_name, " Skewness of Degree Centrality:", degree_skewness)
    print(graph_name, " Kurtosis of Degree Centrality:", degree_kurtosis)

    section = results_dict[graph_name]
    section["degree_skewness"] = degree_skewness
    section["degree_kurtosis"] = degree_kurtosis
    # Reading the numbers:
    # - Strongly positive skewness: long right tail, i.e. a few nodes with very high centrality.
    # - High kurtosis: heavy tails or a sharply peaked distribution.


In [None]:
degree_centrality_skewness(G, results, multi_g)
degree_centrality_skewness(simple_digraph, results, di_g)
degree_centrality_skewness(G_with_ports, results, multi_g_with_ports)
degree_centrality_skewness(simple_digraph_with_ports, results, di_g_with_ports)

# Checkpoint everything collected so far. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

## Attackers / Victims

In [None]:
def attackers_victims(graph, results_dict, graph_name, label_col):
    """Count the distinct sources and targets of attack-labelled edges.

    An edge counts as an attack when its ``label_col`` attribute equals 1. Its
    source node is an attacker and its target node is a victim.

    Args:
        graph: Graph exposing ``edges(data=True)`` and ``number_of_nodes()``.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist and receives the counts and proportions.
        graph_name: Section key, also used as prefix of the printed lines.
        label_col: Name of the edge attribute holding the binary label.

    Returns:
        None.
    """
    print("====================", "====================", graph_name, sep="\n")

    # Endpoints of every attack edge, collected in a single pass over the edges.
    attack_edges = [
        (src, dst)
        for src, dst, attrs in graph.edges(data=True)
        if attrs[label_col] == 1
    ]
    attackers = {src for src, _ in attack_edges}
    victims = {dst for _, dst in attack_edges}

    num_attackers = len(attackers)
    num_victims = len(victims)

    # Shares relative to all nodes; a graph without nodes reports 0 for both.
    total_nodes = graph.number_of_nodes()
    if total_nodes > 0:
        attacker_proportion = num_attackers / total_nodes
        victim_proportion = num_victims / total_nodes
    else:
        attacker_proportion = 0
        victim_proportion = 0

    print(graph_name, " Number of Attackers:", num_attackers)
    print(graph_name, " Number of Victims:", num_victims)
    print(graph_name, " Proportion of Attackers:", attacker_proportion)
    print(graph_name, " Proportion of Victims:", victim_proportion)

    section = results_dict[graph_name]
    section["total_nodes"] = total_nodes
    section["Number of Attackers"] = num_attackers
    section["Number of Victims"] = num_victims
    section["Proportion of Attackers"] = attacker_proportion
    section["Proportion of Victims"] = victim_proportion
    # Nodes that are both the source and the target of some attack edge.
    section["intersection between attacks and victims"] = len(attackers & victims)

    # Reading the numbers:
    # - Attackers: source nodes of edges labeled as "Attack".
    # - Victims: target nodes of edges labeled as "Attack".
    # - Together they describe the roles nodes play in attack scenarios.


In [None]:
# Only the multigraph variants are analysed; the simple-digraph calls are disabled.
attackers_victims(G, results, multi_g, dataset.label_col)
# attackers_victims(simple_digraph, results, di_g, dataset.label_col)
attackers_victims(G_with_ports, results, multi_g_with_ports, dataset.label_col)
# attackers_victims(simple_digraph_with_ports, results, di_g_with_ports, dataset.label_col)

# Checkpoint everything collected so far. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

## Graph Metrics Analysis

### Clustering Coefficients

In [None]:
def cal_clustering_coefficients(graph, results_dict, graph_name):
    """Store mean and standard deviation of the per-node clustering coefficients.

    The graph is first converted with ``nx.Graph`` and the coefficients are
    computed on that converted graph.

    Args:
        graph: Graph that ``nx.Graph`` can be constructed from.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist and receives both statistics.
        graph_name: Section key, also used as prefix of the printed lines.

    Returns:
        None.
    """
    print("====================", "====================", graph_name, sep="\n")

    # Clustering is evaluated on the plain undirected form of the graph.
    undirected = nx.Graph(graph)
    clustering_values = list(nx.clustering(undirected).values())

    mean_clustering = np.mean(clustering_values)
    std_clustering = np.std(clustering_values)

    print(graph_name, " Mean Clustering Coefficient:", mean_clustering)
    print(graph_name, " Standard Deviation of Clustering Coefficients:", std_clustering)

    section = results_dict[graph_name]
    section["Mean Clustering Coefficient"] = mean_clustering
    section["Standard Deviation of Clustering Coefficients"] = std_clustering

In [None]:
cal_clustering_coefficients(G, results, multi_g)
cal_clustering_coefficients(simple_digraph, results, di_g)
cal_clustering_coefficients(G_with_ports, results, multi_g_with_ports)
cal_clustering_coefficients(simple_digraph_with_ports, results, di_g_with_ports)

# Checkpoint everything collected so far. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

### Degree Assortativity

In [None]:
def cal_degree_assortativity(graph, results_dict, graph_name):
    """Store the degree assortativity coefficient of the graph.

    When networkx raises ``NetworkXError`` the string "not applicable" is
    stored instead and the error is printed.

    Args:
        graph: Graph accepted by ``nx.degree_assortativity_coefficient``.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist.
        graph_name: Section key, also used as prefix of the printed lines.

    Returns:
        None.
    """
    print("====================", "====================", graph_name, sep="\n")

    result_key = "Graph Degree Assortativity Coefficient"
    try:
        degree_assortativity = nx.degree_assortativity_coefficient(graph)
    except nx.NetworkXError as e:
        results_dict[graph_name][result_key] = "not applicable"
        print(graph_name, " Error calculating assortativity:", e)
    else:
        results_dict[graph_name][result_key] = degree_assortativity
        print(graph_name, " Degree Assortativity Coefficient:", degree_assortativity)
    

In [None]:
cal_degree_assortativity(G, results, multi_g)
cal_degree_assortativity(simple_digraph, results, di_g)
cal_degree_assortativity(G_with_ports, results, multi_g_with_ports)
cal_degree_assortativity(simple_digraph_with_ports, results, di_g_with_ports)

# Checkpoint everything collected so far. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

### Graph Diameter

In [None]:
def cal_diameter(graph, results_dict, graph_name):
    """Store the diameter of the graph, or "not applicable" when it is undefined.

    The diameter is only computed when the graph is strongly connected. In
    every other case, including a ``NetworkXError`` raised by networkx, the
    string "not applicable" is stored, so the "diameter" key is always present.

    Args:
        graph: Directed graph accepted by ``nx.is_strongly_connected``.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist.
        graph_name: Section key, also used as prefix of the printed lines.

    Returns:
        None.
    """
    print("====================")
    print("====================")
    print(graph_name)
    # Graph Diameter Metric
    try:
        if nx.is_strongly_connected(graph):
            diameter = nx.diameter(graph)
            results_dict[graph_name]["diameter"] = diameter
            print(graph_name, " Graph Diameter multidigraph:", diameter)
        else:
            results_dict[graph_name]["diameter"] = "not applicable"
            print(graph_name, " Graph is not strongly connected, diameter is undefined.")

    except nx.NetworkXError as e:
        # Record the failure the same way the assortativity and path-length
        # helpers do; otherwise the key is silently missing from the saved JSON.
        results_dict[graph_name]["diameter"] = "not applicable"
        print("Error calculating diameter:", e)

In [None]:
cal_diameter(G, results, multi_g)
cal_diameter(simple_digraph, results, di_g)
cal_diameter(G_with_ports, results, multi_g_with_ports)
cal_diameter(simple_digraph_with_ports, results, di_g_with_ports)

# Checkpoint everything collected so far. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))

### Path Length Distribution

In [None]:
def path_length_distribution(graph, results_dict, graph_name):
    """Store mean and standard deviation of all shortest-path lengths.

    The lengths come from ``nx.all_pairs_shortest_path_length``. When networkx
    raises ``NetworkXError`` both entries are set to "not applicable".

    Args:
        graph: Graph accepted by ``nx.all_pairs_shortest_path_length``.
        results_dict: Dict of result sections; ``results_dict[graph_name]``
            must already exist.
        graph_name: Section key, also used as prefix of the printed lines.

    Returns:
        None.
    """
    print("====================", "====================", graph_name, sep="\n")

    section = results_dict[graph_name]
    try:
        # Flatten the per-source distance maps into one list of lengths.
        # NOTE(review): networkx presumably also reports each source's distance
        # to itself (0), so those zeros are part of both statistics - confirm
        # that this is intended.
        all_lengths = [
            length
            for _, distances in nx.all_pairs_shortest_path_length(graph)
            for length in distances.values()
        ]
    except nx.NetworkXError as e:
        section["Mean Path Length"] = "not applicable"
        section["Standard Deviation of Path Lengths"] = "not applicable"
        print(graph_name, " Error calculating path length distribution:", e)
    else:
        mean_path_length = np.mean(all_lengths)
        std_path_length = np.std(all_lengths)

        print(graph_name, " Mean Path Length MultiDiGraph:", mean_path_length)
        print(graph_name, " Standard Deviation of Path Lengths MultiDiGraph:", std_path_length)
        section["Mean Path Length"] = mean_path_length
        section["Standard Deviation of Path Lengths"] = std_path_length

    # Reading the graph metrics:
    # - Diameter: Longest shortest path in the graph (undefined for disconnected graphs).
    # - Assortativity: Correlation of node degrees (positive, negative, or neutral).
    # - Clustering Coefficients: Measure of local connectivity (distribution provides network structure insights).
    # - Path Lengths: Reachability analysis using shortest paths.

In [None]:
# Only the IP-level graphs are analysed; the port-level calls are disabled.
path_length_distribution(graph=G, results_dict=results, graph_name=multi_g)
path_length_distribution(graph=simple_digraph, results_dict=results, graph_name=di_g)
# path_length_distribution(G_with_ports, results, multi_g_with_ports)
# path_length_distribution(simple_digraph_with_ports, results, di_g_with_ports)

## Saving File

In [None]:
# Final write of the collected properties. `write` emits the serialized document
# in one call; `writelines` would iterate over the string character by character.
with open(os.path.join("datasets", name, "df_properties_new.json"), "w") as f:
    f.write(json.dumps(results, cls=NumpyEncoder))