In [10]:
import pandas as pd
import os
import pickle

import networkx as nx

from src.dataset.dataset_info import datasets

In [11]:
# Dataset selection: exactly one `name = ...` line should be active. The chosen
# key is used to look up the entry in `datasets` and, in later cells, to build
# the "datasets/<name>/..." output paths.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
name = "nf_uq_nids"
# name = "x_iiot"

# Entry for the selected dataset; later cells read its `path` and its
# source/destination IP and port column names.
dataset = datasets[name]

In [12]:
import networkx as nx
import igraph as ig
import json
import timeit

import time
from functools import wraps


def time_execution(func):
    """Decorator that times ``func`` and reports its result when verbose.

    The wrapped function is always called exactly once and its return value
    is passed through unchanged.  When the call's ``verbose`` argument is
    truthy -- whether it was passed positionally or by keyword -- the wrapper
    additionally prints the function name, the returned value and the elapsed
    time measured with ``timeit.default_timer``.

    Args:
        func: Callable to wrap.  If it has a parameter named ``verbose`` the
            value bound to that parameter controls the reporting; otherwise
            a ``verbose`` keyword argument is looked up, defaulting to False.

    Returns:
        The wrapping callable, carrying ``func``'s metadata via ``wraps``.
    """
    import inspect  # local import so the decorator stays self-contained

    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # Some callables expose no signature; fall back to keyword lookup only.
        signature = None

    def is_verbose(args, kwargs):
        """Return the truthiness of the ``verbose`` argument of this call."""
        if signature is not None:
            try:
                bound = signature.bind_partial(*args, **kwargs)
            except TypeError:
                # Arguments do not fit the signature: stay quiet here and let
                # the real call to ``func`` raise the meaningful error.
                bound = None
            if bound is not None and "verbose" in bound.arguments:
                return bool(bound.arguments["verbose"])
        return bool(kwargs.get("verbose", False))

    @wraps(func)
    def wrapper(*args, **kwargs):
        if not is_verbose(args, kwargs):
            return func(*args, **kwargs)

        start_time = timeit.default_timer()
        result = func(*args, **kwargs)
        print(
            f"==>> {func.__name__}: {result}, in {str(timeit.default_timer() - start_time)} seconds")
        return result
    return wrapper


@time_execution
def number_of_nodes(G, verbose):
    """Return the node count reported by ``G.number_of_nodes()``.

    ``verbose`` is not used in the body; it is part of the signature because
    the ``time_execution`` wrapper looks for a ``verbose`` argument.
    """
    node_count = G.number_of_nodes()
    return node_count


@time_execution
def number_of_edges(G, verbose):
    """Return the edge count reported by ``G.number_of_edges()``.

    ``verbose`` is not used in the body; it is part of the signature because
    the ``time_execution`` wrapper looks for a ``verbose`` argument.
    """
    edge_count = G.number_of_edges()
    return edge_count


@time_execution
def transitivity(G, verbose):
    """Return ``nx.transitivity(G)`` for the given graph.

    ``verbose`` is not used in the body; it is part of the signature because
    the ``time_execution`` wrapper looks for a ``verbose`` argument.
    """
    graph_transitivity = nx.transitivity(G)
    return graph_transitivity


@time_execution
def density(G, verbose):
    """Return ``nx.density(G)`` for the given graph.

    ``verbose`` is not used in the body; it is part of the signature because
    the ``time_execution`` wrapper looks for a ``verbose`` argument.
    """
    graph_density = nx.density(G)
    return graph_density


@time_execution
def mixing_parameter(G, communities, verbose):
    """Return the fraction of edges of ``G`` that cross community boundaries.

    Args:
        G: Graph exposing ``edges()`` (yielding ``(u, v)`` pairs) and
            ``number_of_edges()``.
        communities: Iterable of node collections.  If a node appears in
            several collections, the last one wins.
        verbose: Not used in the body; present for the ``time_execution``
            wrapper.

    Returns:
        ``inter_community_edges / total_edges`` as a float, or ``0.0`` when
        the graph has no edges (there is nothing that could cross a boundary,
        and this avoids a ZeroDivisionError).

    Raises:
        KeyError: If an edge endpoint is not a member of any community.
    """
    total_edges = G.number_of_edges()
    if total_edges == 0:
        return 0.0

    # Step 1: map each node to the index of its community.
    node_to_community = {
        node: community_index
        for community_index, community in enumerate(communities)
        for node in community
    }

    # Step 2: count the edges whose endpoints sit in different communities.
    inter_cluster_edges = sum(
        1 for u, v in G.edges()
        if node_to_community[u] != node_to_community[v]
    )

    return inter_cluster_edges / total_edges


@time_execution
def modularity(G, communities, verbose):
    """Return ``nx.community.modularity(G, communities)``.

    Timing and printing are left entirely to the ``time_execution``
    decorator.  The body previously timed and printed the value itself as
    well, which produced two identical "==>> modularity: ..." lines per call.

    Args:
        G: Graph passed through to ``nx.community.modularity``.
        communities: Partition of the nodes of ``G`` passed through to
            ``nx.community.modularity``.
        verbose: Not used in the body; present for the ``time_execution``
            wrapper.
    """
    return nx.community.modularity(G, communities)


def get_degrees(G, verbose):
    """Collect the degree of every node of ``G`` into a list.

    Iterates the ``(node, degree)`` pairs yielded by ``G.degree()`` and keeps
    only the degree of each pair, in iteration order.  When ``verbose`` is
    truthy the elapsed time is printed.
    """
    began_at = timeit.default_timer()

    degrees = []
    for _node, node_degree in G.degree():
        degrees.append(node_degree)

    if not verbose:
        return degrees

    print(
        f"==>> calculated degrees, in {str(timeit.default_timer() - began_at)} seconds")
    return degrees


def find_communities(G, verbose):
    """Detect communities of ``G`` with igraph's ``community_infomap``.

    The graph is converted with ``ig.Graph.from_networkx`` and each resulting
    cluster of vertex indices is translated into a list of the vertices'
    ``'_nx_name'`` attribute (presumably the original networkx node labels --
    verify against the igraph documentation).

    Returns:
        A list of lists, one inner list per detected community.  When
        ``verbose`` is truthy the number of communities and the elapsed time
        are printed.
    """
    began_at = timeit.default_timer()
    ig_graph = ig.Graph.from_networkx(G)

    # Other igraph algorithms that can be swapped in here:
    #   ig_graph.community_multilevel()
    #   ig_graph.community_spinglass()
    #   ig_graph.community_edge_betweenness()
    # or, staying in networkx: nx.community.louvain_communities(G)
    partition = ig_graph.community_infomap()

    communities = [
        [ig_graph.vs[vertex_index]['_nx_name'] for vertex_index in cluster]
        for cluster in partition
    ]

    if verbose:
        print(
            f"==>> number_of_communities: {len(communities)}, in {str(timeit.default_timer() - began_at)} seconds")

    return communities


def calculate_graph_measures(G, file_path=None, verbose=False):
    """Compute a dictionary of structural measures for ``G``.

    Measures, in insertion order: ``number_of_nodes``, ``number_of_edges``,
    ``max_degree``, ``avg_degree``, ``transitivity`` (only when ``G`` is
    exactly an ``nx.Graph`` or ``nx.DiGraph``), ``density``,
    ``number_of_communities``, ``mixing_parameter`` and ``modularity``.

    Args:
        G: Graph to analyse.
        file_path: Optional path; when truthy the measures are written there
            as a single JSON document, overwriting any existing file.
        verbose: Forwarded to every helper so they can report progress.

    Returns:
        The dictionary of measures.

    Raises:
        ValueError: From ``max`` when the graph has no nodes.
        OSError: If ``file_path`` cannot be opened for writing.
    """
    properties = {}

    properties["number_of_nodes"] = number_of_nodes(G, verbose)
    properties["number_of_edges"] = number_of_edges(G, verbose)

    degrees = get_degrees(G, verbose)

    properties["max_degree"] = max(degrees)
    properties["avg_degree"] = sum(degrees) / len(degrees)

    # Exact type match on purpose: subclasses of Graph/DiGraph (e.g. the
    # multigraph passed by the notebook cells) skip the transitivity step.
    if type(G) in (nx.DiGraph, nx.Graph):
        properties["transitivity"] = transitivity(G, verbose)

    properties["density"] = density(G, verbose)

    communities = find_communities(G, verbose)

    properties["number_of_communities"] = len(communities)
    properties["mixing_parameter"] = mixing_parameter(G, communities, verbose)
    properties["modularity"] = modularity(G, communities, verbose)

    if file_path:
        # Context manager guarantees the handle is closed even if
        # serialisation or the write itself raises.
        with open(file_path, 'w') as outfile:
            outfile.write(json.dumps(properties))

    return properties


In [13]:
# with open("datasets/" + name + "/training_graph.pkl", "rb") as f:
#     G = pickle.load(f)

# Load the flow records and build one directed edge per record between the
# source and destination IP columns (parallel edges are kept).
df = pd.read_parquet(dataset.path)
G = nx.from_pandas_edgelist(
    df,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    create_using=nx.MultiDiGraph(),
)

# Network properties of the multigraph, saved under the dataset's folder.
graph_measures = calculate_graph_measures(
    G,
    file_path="datasets/" + name + "/multiDiGraph_graph_measures.json",
    verbose=True,
)
print(f"==>> graph_measures: {graph_measures}")

# Same measures for the graph converted with nx.DiGraph(G).
graph_measures = calculate_graph_measures(
    nx.DiGraph(G),
    file_path="datasets/" + name + "/digraph_graph_measures.json",
    verbose=True,
)
print(f"==>> graph_measures: {graph_measures}")

==>> number_of_nodes: 93645, in 6.299465894699097e-06 seconds
==>> number_of_edges: 10729039, in 0.429551899433136 seconds
==>> calculated degrees, in 0.6344597004354 seconds
==>> density: 0.001223478307607757, in 0.9582200013101101 seconds


In [None]:
# Build "ip:port" endpoint labels in a separate two-column edge list instead
# of overwriting the IP/port columns of `df`.  The in-place version appended
# the port again on every re-run of this cell ("ip:port:port"), so the cell
# was not idempotent.
# NOTE(review): `df` is now left unmodified -- confirm that no other cell
# relies on its IP columns already containing the ":port" suffix.
src_ports = df[dataset.src_port_col].astype(float).astype(int).astype(str)  # to remove the decimal point
dst_ports = df[dataset.dst_port_col].astype(float).astype(int).astype(str)  # to remove the decimal point

port_edges = pd.DataFrame({
    dataset.src_ip_col: df[dataset.src_ip_col] + ':' + src_ports,
    dataset.dst_ip_col: df[dataset.dst_ip_col] + ':' + dst_ports,
})

# Only the source/target columns are needed to build the edges.
G = nx.from_pandas_edgelist(port_edges, dataset.src_ip_col, dataset.dst_ip_col, create_using=nx.MultiDiGraph())


# get network properties
graph_measures = calculate_graph_measures(G, "datasets/" + name + "/ports_multiDiGraph_graph_measures.json", verbose=True)
print(f"==>> graph_measures: {graph_measures}")

graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/ports_digraph_graph_measures.json", verbose=True)
print(f"==>> graph_measures: {graph_measures}")

==>> number_of_nodes: 268120, in 5.599111318588257e-06 seconds
==>> number_of_edges: 631814, in 0.8301995992660522 seconds
==>> calculated degrees, in 0.9263340011239052 seconds
==>> density: 8.788857910562231e-06, in 0.5778869986534119 seconds
==>> number_of_communities: 42, in 830.590715598315 seconds
==>> mixing_parameter: 0.0, in 1.8683775998651981 seconds
==>> modularity: 0.25005780477671435, in 5.020574200898409 seconds
==>> modularity: 0.25005780477671435, in 5.0207654014229774 seconds
==>> graph_measures: {'number_of_nodes': 268120, 'number_of_edges': 631814, 'max_degree': 147700, 'avg_degree': 4.71291958824407, 'density': 8.788857910562231e-06, 'number_of_communities': 42, 'mixing_parameter': 0.0, 'modularity': 0.25005780477671435}
==>> number_of_nodes: 268120, in 9.3020498752594e-06 seconds
==>> number_of_edges: 364616, in 0.5279907993972301 seconds
==>> calculated degrees, in 0.1259106993675232 seconds
==>> transitivity: 9.713003170921958e-07, in 1.8788352981209755 seconds
=

In [None]:
# with open("datasets/" + name + "/training_graph.pkl", "rb") as f:
#     G = pickle.load(f)


# # get network properties
# graph_measures = calculate_graph_measures(G, "datasets/" + name + "/training_graph_measures.json", verbose=True)
# print(f"==>> graph_measures: {graph_measures}")

# graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
# print(f"==>> graph_measures: {graph_measures}")

In [None]:
# with open("datasets/" + name + "/testing_graph.pkl", "rb") as f:
#     G_test = pickle.load(f)

# graph_measures = calculate_graph_measures(G_test, "datasets/" + name + "/testing_graph_measures.json", verbose=True)
# print(f"==>> graph_measures: {graph_measures}")

# graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_simple_measures.json", verbose=True)
# print(f"==>> graph_measures: {graph_measures}")

In [None]:
# import igraph as ig
# G1 = ig.Graph.from_networkx(G)

# part = G1.community_infomap()

In [None]:
# import networkx as nx

# import timeit

# verbose = True
# properties = {}

# start_time = timeit.default_timer()

# # part = G1.community_multilevel()
# # part = G1.community_spinglass()
# # part = G1.community_edge_betweenness()

# communities = []
# for com in part:
#     communities.append([G1.vs[node_index]['_nx_name']
#                         for node_index in com])

# # communities = nx.community.louvain_communities(G)
# number_of_communities = len(communities)
# if verbose:
#     print(
#         f"==>> number_of_communities: {number_of_communities}, in {str(timeit.default_timer() - start_time)} seconds")
# properties["number_of_communities"] = number_of_communities

# # Step 1: Map each node to its community
# node_to_community = {}
# for community_index, community in enumerate(communities):
#     for node in community:
#         node_to_community[node] = community_index

# # Step 2: Count inter-cluster edges efficiently
# inter_cluster_edges = 0
# for u, v in G.edges():
#     # Directly check if u and v belong to different communities
#     if node_to_community[u] != node_to_community[v]:
#         inter_cluster_edges += 1

# start_time = timeit.default_timer()
# mixing_parameter = inter_cluster_edges / G.number_of_edges()
# if verbose:
#     print(
#         f"==>> mixing_parameter: {mixing_parameter}, in {str(timeit.default_timer() - start_time)} seconds")
# properties["mixing_parameter"] = mixing_parameter

# start_time = timeit.default_timer()
# modularity = nx.community.modularity(G, communities)
# if verbose:
#     print(
#         f"==>> modularity: {modularity}, in {str(timeit.default_timer() - start_time)} seconds")
# properties["modularity"] = modularity