In [1]:
import os
import pickle
import random
import socket
import struct

import networkx as nx
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from scipy.sparse import csr_matrix

from src.dataset.dataset_info import datasets
from src.graph.graph_construction import create_weightless_window_graph
from src.graph.graph_measures import calculate_graph_measures
from src.graph.centralities import add_centralities, add_centralities_as_node_features
from local_variables import local_datasets_path



In [2]:
# ---------------------------------------------------------------------------
# Notebook configuration: every switch a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# True  -> targets come from dataset.class_num_col (multi-class),
# False -> targets come from dataset.label_col.
multi_class = True

# Enables the centrality-based features (line graphs: extra columns on the
# splits; flow graphs: an "n_feats" vector on every node).
use_node_features = False

# In the cells of this notebook the two flags below only change the name of
# the output folder ("__ports" / "__generated_ips").
use_port_in_address = False
generated_ips = False

# Which graph representation to build: "flow", "window" or "line".
graph_type = "window"

# Window size handed to create_weightless_window_graph (window / line graphs).
window_size = 500

# Selects the split branch (plain vs. stratified) and the "__sorted" /
# "__unsorted" folder suffix.
sort_timestamp = True

# k_fold = None   # k-fold support is currently disabled
# k_fold = 5

# test_size is taken from the full table; validation_size is then taken from
# what remains after the test split.
validation_size = 0.1
test_size = 0.1

# Centrality measures to compute.
cn_measures = ["betweenness", "degree", "pagerank", "closeness", "k_truss"]
# cn_measures = ["betweenness", "degree", "closeness"]

# One "src_<measure>" and one "dst_<measure>" feature per centrality measure.
# Derived from cn_measures so the two lists cannot drift apart when the
# measure list is edited.
network_features = [
    f"{endpoint}_{measure}"
    for measure in cn_measures
    for endpoint in ("src", "dst")
]

In [3]:
# Key of the dataset to process; it must exist in the project's `datasets`
# registry. The commented lines are the other keys used with this notebook.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Dataset descriptor; later cells read its column names (label_col,
# class_num_col, src_ip_col, dst_ip_col, timestamp_col, ...) from it.
dataset = datasets[name]

In [4]:
# Build the name of the output sub-folder from the active configuration:
# a base name for the graph type followed by one suffix per enabled option.
base_names = {
    "flow": "flow",
    "line": f"line_graph_{window_size}",
    "window": f"window_graph_{window_size}",
}

option_suffixes = [
    suffix
    for enabled, suffix in (
        (multi_class, "__multi_class"),
        (use_node_features, "__n_feats"),
        (use_port_in_address, "__ports"),
        (generated_ips, "__generated_ips"),
    )
    if enabled
]
# The ordering flag always contributes a suffix, one way or the other.
option_suffixes.append("__sorted" if sort_timestamp else "__unsorted")

# An unrecognised graph_type falls back to an empty base name.
g_type = base_names.get(graph_type, "") + "".join(option_suffixes)

dataset_path = os.path.join(local_datasets_path, name)
folder_path = os.path.join(dataset_path, g_type)
folder_path

'C:\\Users\\Administrateur\\Desktop\\datasets\\cic_ids_2017\\flow__multi_class__unsorted'

In [5]:
# Load the flow table of the selected dataset from "<dataset_path>/<name>.parquet".
parquet_path = os.path.join(dataset_path, f"{name}.parquet")
df = pd.read_parquet(parquet_path)

In [6]:
# Preview the first rows of the loaded flow table.
df.head()

Unnamed: 0_level_0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443.0,192.168.10.5,54865.0,6.0,7/7/2017 3:30,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80.0,192.168.10.5,55054.0,6.0,7/7/2017 3:30,109.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80.0,192.168.10.5,55055.0,6.0,7/7/2017 3:30,52.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443.0,192.168.10.16,46236.0,6.0,7/7/2017 3:30,34.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443.0,192.168.10.5,54863.0,6.0,7/7/2017 3:30,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0


In [7]:
# Columns that get standardised and packed into the per-flow feature vector:
# every column except the two target columns and the columns the dataset
# descriptor lists as dropped or weak.
#
# The list is built by walking df.columns instead of list(set(...) - ...):
# the iteration order of a set of strings changes between interpreter
# sessions (hash randomisation), which would silently permute the feature
# vector from one run of the notebook to the next.
excluded_cols = {
    dataset.label_col,
    dataset.class_num_col,
    *dataset.drop_columns,
    *dataset.weak_columns,
}
cols_to_norm = [col for col in df.columns if col not in excluded_cols]

In [8]:
# Number of flows per distinct value of the label column (class balance).
df[dataset.label_col].value_counts()

Label
0    2265910
1     548968
Name: count, dtype: int64

In [9]:
# Timestamp format string declared by the dataset descriptor.
dataset.timestamp_format

'mixed'

In [10]:
# dtype of the timestamp column as loaded (before any datetime conversion).
df[dataset.timestamp_col].dtypes

dtype('O')

In [12]:
def method_grouping_time_generator(df, time_window_seconds=60, timestamp_col="timestamp", src_ip_col = "src_ip",
                                   timestamp_format="mixed", include_node_attributes=True):
    """
    Build a flow graph: one node per DataFrame index label, and an edge between
    two flows when
      1. they share the same source IP address, and
      2. their timestamps, floored to whole seconds, differ by at most
         time_window_seconds.

    Edges are produced by a generator and streamed into the graph, so the full
    edge list is never materialised outside the graph itself.

    Side effect: if df[timestamp_col] is not already a datetime column it is
    converted IN PLACE with pd.to_datetime(..., format=timestamp_format).

    Parameters:
      df (pd.DataFrame): Flow dataset.
      time_window_seconds (int): Maximum allowed time difference between flows (in seconds).
      timestamp_col (str): Name of the timestamp column.
      src_ip_col (str): Name of the source-IP column.
      timestamp_format (str): `format` passed to pd.to_datetime when the column
                              still has to be converted.
      include_node_attributes (bool): When True (default) every node carries the
                              values of its row as attributes. Pass False to add
                              bare nodes only, which needs far less memory on
                              large tables.

    Returns:
      G (networkx.Graph): Graph with nodes representing flows and edges connecting flows
                          with matching src_ip that occur within the given time window.
                          Flows whose timestamp is missing (NaT) stay isolated.
    """

    # Convert the timestamp column to datetime if not already
    if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
        df[timestamp_col] = pd.to_datetime(df[timestamp_col], format=timestamp_format)

    # Create an empty graph and add each flow as a node.
    G = nx.Graph()
    if include_node_attributes:
        for idx, row in df.iterrows():
            G.add_node(idx, **row.to_dict())
    else:
        G.add_nodes_from(df.index)

    def generate_edges_for_group(indices, times, window):
        """
        Yield one (label_i, label_j) pair for every two flows of a group whose
        timestamps are at most `window` seconds apart.

        Parameters:
          indices: index labels of the group's flows, ordered like `times`.
          times: whole-second timestamps in non-decreasing order.
          window: maximum allowed time difference (in seconds).
        """
        for i in range(len(times)):
            # First position whose timestamp is strictly greater than
            # times[i] + window; everything before it is inside the window.
            right_bound = np.searchsorted(times, times[i] + window, side='right')
            for j in range(i + 1, right_bound):
                yield (indices[i], indices[j])

    # Seconds are computed as a floor division of "time since the epoch" by one
    # second. Unlike `astype(int64) // 10**9` this does not assume that the
    # column is stored with nanosecond resolution.
    epoch = pd.Timestamp(0, tz=df[timestamp_col].dt.tz)
    one_second = pd.Timedelta(seconds=1)

    # Group the DataFrame by source IP and process each group.
    for _, group in df.groupby(src_ip_col):
        # Rows without a timestamp cannot be placed on the time axis; leaving
        # them in would also break the sorted-array requirement of searchsorted.
        group_sorted = group.dropna(subset=[timestamp_col]).sort_values(timestamp_col)
        if len(group_sorted) < 2:
            continue  # a single flow has nothing to connect to

        indices = group_sorted.index.to_numpy()
        times = ((group_sorted[timestamp_col] - epoch) // one_second).to_numpy()

        # Stream the edges of this group straight into the graph.
        G.add_edges_from(generate_edges_for_group(indices, times, time_window_seconds))

    return G



# Connect flows of the same source IP that share the same (whole-second) timestamp.
# NOTE: the recorded run of this cell on the full table ended in a MemoryError.
G_time = method_grouping_time_generator(
    df,
    time_window_seconds=0,
    timestamp_col=dataset.timestamp_col,
    src_ip_col=dataset.src_ip_col,
)
print("Graph created by the grouping method:")
print(f"Nodes: {G_time.number_of_nodes()}")
print(f"Edges: {G_time.number_of_edges()}")

MemoryError: 

In [12]:
import pandas as pd
import networkx as nx

def method_grouping_time(df, time_window_seconds=60, timestamp_col=None, src_ip_col=None, timestamp_format=None):
    """
    Create a flow graph where each node represents a flow (a row in df) and an edge is added
    between two flows if:
      1. They share the same source IP address.
      2. Their timestamps are at most `time_window_seconds` seconds apart.

    Side effect: if the timestamp column is not already a datetime column it is
    converted IN PLACE with pd.to_datetime.

    Parameters:
      df (pd.DataFrame): Flow dataset containing the source-IP and timestamp columns.
      time_window_seconds (float): Maximum time difference (in seconds) between two flows
                                   for an edge to be added.
      timestamp_col (str | None): Timestamp column; defaults to dataset.timestamp_col.
      src_ip_col (str | None): Source-IP column; defaults to dataset.src_ip_col.
      timestamp_format (str | None): `format` for pd.to_datetime when a conversion is
                                     needed; defaults to dataset.timestamp_format.

    Returns:
      G (networkx.Graph): Graph where each node represents a flow (indexed by DataFrame row index)
                          and an edge exists if both flows share the same src_ip and their
                          timestamp difference is within the specified time window.
    """
    # Fall back to the notebook-level dataset descriptor for anything not given.
    if timestamp_col is None:
        timestamp_col = dataset.timestamp_col
    if src_ip_col is None:
        src_ip_col = dataset.src_ip_col

    # Convert the timestamp column to datetime if not already
    if not pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
        if timestamp_format is None:
            timestamp_format = dataset.timestamp_format
        df[timestamp_col] = pd.to_datetime(df[timestamp_col], format=timestamp_format)

    # Create an empty graph
    G = nx.Graph()

    # Add every flow as a node with its attributes.
    for idx, row in df.iterrows():
        G.add_node(idx, **row.to_dict())

    # Group the flows by source IP
    for _, group in df.groupby(src_ip_col):
        # Sort the group by timestamp so that the scan below can stop early.
        group_sorted = group.sort_values(timestamp_col)
        indices = group_sorted.index.tolist()
        timestamps = group_sorted[timestamp_col].tolist()

        n = len(timestamps)
        for i in range(n):
            for j in range(i + 1, n):
                elapsed = (timestamps[j] - timestamps[i]).total_seconds()
                # Written as "not (<=)" on purpose: a missing timestamp gives
                # NaN, which must end the scan exactly like a too-large gap.
                if not (elapsed <= time_window_seconds):
                    break
                G.add_edge(indices[i], indices[j])

    return G

# Link flows of the same source IP that are at most one second apart.
G_time = method_grouping_time(df, time_window_seconds=1)
print("Graph created by the grouping method:")
print(f"Nodes: {G_time.number_of_nodes()}")
print(f"Edges: {G_time.number_of_edges()}")

Graph created by the grouping method:
Nodes: 692543
Edges: 0


In [25]:
def method_grouping(df, src_ip_col=None):
    """
    Create a flow graph by grouping flows based on their source IP.

    For each source IP, every pair of flows is connected with an edge.

    Parameters:
        df (pd.DataFrame): DataFrame containing flow data with a source-IP column.
        src_ip_col (str | None): Name of the source-IP column; defaults to
                                 dataset.src_ip_col.

    Returns:
        G (networkx.Graph): Graph where each node is a flow (row index label) and
                            an edge exists if two flows share the same source IP.
    """
    if src_ip_col is None:
        src_ip_col = dataset.src_ip_col

    G = nx.Graph()
    # Add each flow as a node, keyed by its index label, with its attributes.
    for idx, row in df.iterrows():
        G.add_node(idx, **row.to_dict())

    # GroupBy.indices maps every source IP to the integer POSITIONS of its rows.
    grouped_positions = df.groupby(src_ip_col).indices

    for positions in grouped_positions.values():
        if len(positions) < 2:
            continue  # No edge to add if only one flow for this src_ip.

        # Nodes are keyed by index label, so positions have to be translated
        # back to labels; using them directly is only correct for a default
        # 0..n-1 index.
        labels = list(df.index[positions])
        n = len(labels)
        G.add_edges_from(
            (labels[i], labels[j])
            for i in range(n)
            for j in range(i + 1, n)
        )

    return G


def method_bipartite(df, src_ip_col=None):
    """
    Create a flow graph using a bipartite approach and then projecting onto flows.

    The bipartite graph has:
        - Flow nodes (each flow with its attributes)
        - Source IP nodes
    An edge connects a flow node to its source IP node.

    Then, the bipartite graph is projected onto the flow nodes. That is,
    two flow nodes share an edge if they both connect to the same source IP.

    Parameters:
        df (pd.DataFrame): DataFrame containing flow data with a source-IP column.
        src_ip_col (str | None): Name of the source-IP column; defaults to
                                 dataset.src_ip_col.

    Returns:
        G (networkx.Graph): Flow graph where nodes (original flow index labels) are
                            connected if the flows share the same source IP.
    """
    if src_ip_col is None:
        src_ip_col = dataset.src_ip_col

    B = nx.Graph()

    # Node ids are (kind, value) tuples. Tagging them this way keeps flow ids
    # and source-IP ids apart whatever their values are, and lets the original
    # index label be recovered later without parsing a string back to int.
    flow_nodes = []
    membership_edges = []
    for idx, row in df.iterrows():
        flow_node = ("flow", idx)
        B.add_node(flow_node, bipartite=0, **row.to_dict())
        flow_nodes.append(flow_node)
        membership_edges.append((flow_node, ("src_ip", row[src_ip_col])))

    # Add source IP nodes.
    for src in df[src_ip_col].unique():
        B.add_node(("src_ip", src), bipartite=1)

    # Create bipartite edges from each flow to its corresponding source IP.
    B.add_edges_from(membership_edges)

    # Repeated index labels map to the same node; keep each node once.
    flow_nodes = list(dict.fromkeys(flow_nodes))

    # Project the bipartite graph onto flow nodes.
    G = nx.algorithms.bipartite.projected_graph(B, flow_nodes)

    # Relabel nodes to recover the original index labels.
    mapping = {node: node[1] for node in flow_nodes}
    G = nx.relabel_nodes(G, mapping)

    return G

# Method 1: connect flows directly, group by group.
G_grouping = method_grouping(df)
print("Graph created by the grouping method:")
print(f"Nodes: {G_grouping.number_of_nodes()}")
print(f"Edges: {G_grouping.number_of_edges()}")

# Method 2: go through a flow / source-IP bipartite graph and project it.
G_bipartite = method_bipartite(df)
print("\nGraph created by the bipartite method:")
print(f"Nodes: {G_bipartite.number_of_nodes()}")
print(f"Edges: {G_bipartite.number_of_edges()}")


In [53]:
# Target vector: multi-class ids or binary labels, depending on the config.
y = df[dataset.class_num_col] if multi_class else df[dataset.label_col]

# One seed for every split so that re-running the notebook reproduces the
# same train / validation / test partition in both branches.
SPLIT_SEED = 13

if sort_timestamp:
    # Plain (non-stratified) split.
    # NOTE(review): train_test_split shuffles by default, so this branch does
    # not keep the rows in their original order. If a chronological split is
    # what "sort_timestamp" is meant to give, pass shuffle=False here - confirm.
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size, random_state=SPLIT_SEED)

    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size, random_state=SPLIT_SEED)
else:
    # Stratified split: every subset keeps the class proportions of y.
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size, random_state=SPLIT_SEED, stratify=y)

    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size, random_state=SPLIT_SEED, stratify=y_tr)

# The full table is not used below; drop the name to free memory.
# (Re-running this cell therefore requires re-loading df first.)
del df

In [54]:
# Line graphs with node features: compute the centrality columns on each
# split separately, then include them in the columns to standardise.
if graph_type == "line" and use_node_features:
    for split_df in (X_train, X_val, X_test):
        add_centralities(df = split_df, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features, create_using=nx.MultiDiGraph())

    # Order-preserving union: existing columns keep their position and the
    # network features are appended once. A set union would give the feature
    # vector a different column order from one interpreter session to the next.
    cols_to_norm = list(dict.fromkeys([*cols_to_norm, *network_features]))
    

In [55]:
# Standardise the feature columns (statistics come from the training split
# only) and pack them into a single list-valued column 'h' per flow.
scaler = StandardScaler()
scaler.fit(X_train[cols_to_norm])

# Everything except the targets, the two endpoint columns and 'h' is removed
# once the feature vector has been built.
kept_cols = {dataset.label_col, dataset.src_ip_col, dataset.dst_ip_col, dataset.class_num_col, 'h'}
cols_to_drop = list(set(X_train.columns) - kept_cols)

for split_df in (X_train, X_val, X_test):
    split_df[cols_to_norm] = scaler.transform(split_df[cols_to_norm])
    split_df['h'] = split_df[cols_to_norm].values.tolist()
    split_df.drop(cols_to_drop, axis=1, inplace=True)

In [56]:
# Window / line graphs: build and store the graphs of each split in its own
# sub-folder of folder_path.
if graph_type in ("window", "line"):
    split_folders = (
        (X_train, "training"),
        (X_val, "validation"),
        (X_test, "testing"),
    )
    for split_df, split_dir in split_folders:
        create_weightless_window_graph(
            df=split_df,
            dataset=dataset,
            window_size=window_size,
            line_graph=graph_type == "line",
            folder_path=os.path.join(folder_path, split_dir),
            edge_attr=['h', dataset.label_col, dataset.class_num_col],
            file_type="pkl",
        )

In [57]:
# Flow graphs: make sure the output folder exists and report the split sizes.
if graph_type == "flow":
    os.makedirs(folder_path, exist_ok=True)
    for split_label, split_df in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
        print(f"==>> {split_label}.shape: {split_df.shape}")


==>> X_train.shape: (511768, 5)
==>> X_val.shape: (56864, 5)
==>> X_test.shape: (63182, 5)


In [58]:
# Flow graph of the training split: one edge per flow between its source and
# destination address, stored together with its measures.
if graph_type == "flow":
    graph_name = "training_graph"

    G = nx.from_pandas_edgelist(
        X_train,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=['h', dataset.label_col, dataset.class_num_col],
        create_using=nx.MultiDiGraph(),
    )

    if use_node_features:
        add_centralities_as_node_features(df=None, G=G, graph_path=None, dataset=dataset, cn_measures=cn_measures)

        # Collect the centralities of every node into one float32 vector;
        # a measure that is missing on a node counts as 0.
        for node_id in G.nodes():
            node_data = G.nodes[node_id]
            node_data["n_feats"] = np.array(
                [node_data.get(measure, 0) for measure in cn_measures],
                dtype=np.float32,
            )

    # Network properties of the graph, also written next to it as JSON.
    measures_path = f"{folder_path}/{graph_name}_measures.json"
    graph_measures = calculate_graph_measures(G, measures_path, verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    with open(f"{folder_path}/{graph_name}.pkl", "wb") as graph_file:
        pickle.dump(G, graph_file)

==>> calculated degrees, in 0.0001368001103401184 seconds
==>> graph_measures: {'number_of_nodes': 107, 'number_of_edges': 511768, 'max_degree': 238453, 'avg_degree': 9565.757009345794, 'density': 45.12149532710281}


In [59]:
# Flow graph of the validation split: one edge per flow between its source
# and destination address, stored together with its measures.
if graph_type == "flow":
    graph_name = "validation_graph"

    G = nx.from_pandas_edgelist(
        X_val,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=['h', dataset.label_col, dataset.class_num_col],
        create_using=nx.MultiDiGraph(),
    )

    if use_node_features:
        add_centralities_as_node_features(df=None, G=G, graph_path=None, dataset=dataset, cn_measures=cn_measures)

        # Collect the centralities of every node into one float32 vector;
        # a measure that is missing on a node counts as 0.
        for node_id in G.nodes():
            node_data = G.nodes[node_id]
            node_data["n_feats"] = np.array(
                [node_data.get(measure, 0) for measure in cn_measures],
                dtype=np.float32,
            )

    # Network properties of the graph, also written next to it as JSON.
    measures_path = f"{folder_path}/{graph_name}_measures.json"
    graph_measures = calculate_graph_measures(G, measures_path, verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    with open(f"{folder_path}/{graph_name}.pkl", "wb") as graph_file:
        pickle.dump(G, graph_file)

==>> calculated degrees, in 0.00010430067777633667 seconds
==>> graph_measures: {'number_of_nodes': 78, 'number_of_edges': 56864, 'max_degree': 26533, 'avg_degree': 1458.051282051282, 'density': 9.467865467865467}


In [60]:
# Flow graph of the test split: one edge per flow between its source and
# destination address, stored together with its measures.
if graph_type == "flow":
    graph_name = "testing_graph"

    G = nx.from_pandas_edgelist(
        X_test,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=['h', dataset.label_col, dataset.class_num_col],
        create_using=nx.MultiDiGraph(),
    )

    if use_node_features:
        add_centralities_as_node_features(df=None, G=G, graph_path=None, dataset=dataset, cn_measures=cn_measures)

        # Collect the centralities of every node into one float32 vector;
        # a measure that is missing on a node counts as 0.
        for node_id in G.nodes():
            node_data = G.nodes[node_id]
            node_data["n_feats"] = np.array(
                [node_data.get(measure, 0) for measure in cn_measures],
                dtype=np.float32,
            )

    # Network properties of the graph, also written next to it as JSON.
    measures_path = f"{folder_path}/{graph_name}_measures.json"
    graph_measures = calculate_graph_measures(G, measures_path, verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    with open(f"{folder_path}/{graph_name}.pkl", "wb") as graph_file:
        pickle.dump(G, graph_file)

==>> calculated degrees, in 0.00010609999299049377 seconds
==>> graph_measures: {'number_of_nodes': 80, 'number_of_edges': 63182, 'max_degree': 29310, 'avg_degree': 1579.55, 'density': 9.997151898734177}
