In [None]:
import pandas as pd
import numpy as np
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_construction.session_graph import define_sessions, create_weightless_session_graph
from src.graph.centralities.add_centralities import add_centralities_as_node_features

In [None]:
# --- Pipeline switches -------------------------------------------------------
# Build and pickle the train/test flow graphs (gates the two graph cells below).
flow_graph = True
# NOTE(review): not referenced by any cell visible in this notebook export.
with_centralities = True

# Node-level measures handed to add_centralities_as_node_features and then
# stacked, in this order, into each node's "n_feats" vector.
cn_measures = [
    "betweenness",
    "degree",
    "pagerank",
    "closeness",
    "k_truss",
]

# Build session graphs (gates the define_sessions cell).
session_graph = False

# Build windowed graphs; only takes effect together with `line_graph`.
# When this is False the train/test split cell runs instead.
window_graph = False
window_size = 2000  # forwarded as `window_size` to create_weightless_window_graph

multi_graph = False     # forwarded to create_weightless_session_graph
line_graph = False      # forwarded to create_weightless_session_graph; also gates the window cell
sort_timestamp = False  # parse + sort by the timestamp column before building graphs

In [None]:
# Select the dataset to process: `name` is the key looked up in `datasets`
# and is also used to build the "datasets/<name>/..." output paths below.
# Uncomment exactly one line to switch datasets.
# NOTE(review): the commented keys are assumed to exist in `datasets` — confirm.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Dataset descriptor; later cells read its path, column names and column lists.
dataset = datasets[name]

In [None]:
# Load the selected dataset from the parquet file at `dataset.path`.
df = pd.read_parquet(dataset.path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Columns that get standardised and packed into the per-row feature vector `h`:
# every column of `df` except the two label columns and the columns the
# dataset descriptor lists as dropped or weak.
#
# This used to be built with set arithmetic. Iteration order of a set of
# strings varies between interpreter sessions (string hash randomisation), so
# the feature order inside `h` -- and inside the pickled graphs -- changed from
# one kernel restart to the next. Filtering `df.columns` keeps the frame's own
# column order, which is stable across runs.
excluded_cols = {dataset.label_col, dataset.class_num_col}
excluded_cols.update(dataset.drop_columns)
excluded_cols.update(dataset.weak_columns)

cols_to_norm = [col for col in df.columns if col not in excluded_cols]
# Disabled in the original notebook (`network_features` is not defined in the
# visible cells):
# if with_centralities:
#     cols_to_norm += list(network_features)
cols_to_norm

In [None]:
# Number of rows per distinct value of the label column.
df[dataset.label_col].value_counts()

In [None]:
if sort_timestamp:
    # Parse the (whitespace-padded) timestamp strings with the dataset's
    # format, then order the frame chronologically in place.
    ts_col = dataset.timestamp_col
    parsed_timestamps = pd.to_datetime(
        df[ts_col].str.strip(),
        format=dataset.timestamp_format,
    )
    df[ts_col] = parsed_timestamps
    df.sort_values(by=ts_col, inplace=True)

In [None]:
if window_graph and line_graph:
    # First reset discards whatever index the frame had; the second one
    # materialises the fresh 0..n-1 index as an "index" column, which is
    # exported below as an edge attribute.
    df.reset_index(drop=True, inplace=True)
    df.reset_index(inplace=True)

    # Standardise the feature columns over the whole frame and pack each row's
    # scaled values into a single list-valued column `h`.
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[cols_to_norm])
    df[cols_to_norm] = scaled_values
    df['h'] = df[cols_to_norm].values.tolist()

    # Output folder depends on whether the flows were sorted by timestamp.
    output_suffix = "/line_graph_sorted" if sort_timestamp else "/line_graph_unsorted"
    folder_path = "datasets/" + name + output_suffix

    edge_attributes = ['h', dataset.label_col, "index"]
    create_weightless_window_graph(
        df=df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=True,
        folder_path=folder_path,
        # folder_path=None,
        # test_percentage = 20,
        edge_attr=edge_attributes,
        file_type="pkl",
    )

In [None]:
if session_graph:
    folder_path = "datasets/" + name + "/session_graphs"

    # The value returned by define_sessions used to be bound to an unused
    # `df2` while the original `df` was handed to the graph builder, so the
    # session assignment could never reach it unless define_sessions mutates
    # its argument. Passing the returned frame is correct in both cases.
    # NOTE(review): assumes define_sessions returns the session-annotated
    # DataFrame -- confirm against src.graph.graph_construction.session_graph.
    sessions_df = define_sessions(
        df,
        src_ip_col=dataset.src_ip_col,
        src_port_col=dataset.src_port_col,
        dst_ip_col=dataset.dst_ip_col,
        dst_port_col=dataset.dst_port_col,
        # protocol_col=dataset.
    )

    create_weightless_session_graph(
        sessions_df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        multi_graph=multi_graph,
        line_graph=line_graph,
        folder_path=folder_path,
    )

In [None]:
if not window_graph:
    # Flow-graph path: split the rows 70/30 into train and test, then
    # standardise the feature columns using statistics from the training
    # split only.
    # Note: when `window_graph` is True this cell is skipped and X_train /
    # X_test are never defined, so the flow-graph cells below cannot run.
    y = df[dataset.label_col]

    if sort_timestamp:
        # NOTE(review): train_test_split shuffles by default, so this is an
        # unseeded random split even though the frame was sorted by
        # timestamp. If a chronological hold-out was intended, pass
        # shuffle=False -- confirm before changing.
        X_train, X_test, y_train, y_test = train_test_split(
            df, y, test_size=0.3)
    else:
        # Seeded, label-stratified split.
        X_train, X_test, y_train, y_test = train_test_split(
            df, y, test_size=0.3, random_state=13, stratify=y)

    # Release the full frame; the later cells only use the two splits.
    del df

    # The second line used to print X_test's shape under an "X_train" label,
    # and a stale "after add_centralities" banner repeated both lines even
    # though nothing happens between the two reports in this cell.
    print(f"==>> X_train.shape: {X_train.shape}")
    print(f"==>> X_test.shape: {X_test.shape}")

    # Fit on train, apply the same transform to test.
    scaler = StandardScaler()
    X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
    X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

    # Pack each row's scaled features into one list-valued column `h`, which
    # becomes the edge attribute of the flow graphs.
    X_train['h'] = X_train[cols_to_norm].values.tolist()
    X_test['h'] = X_test[cols_to_norm].values.tolist()

In [None]:
if flow_graph:
    # One directed edge per training row (source IP -> destination IP),
    # carrying the feature vector `h` and the label; parallel edges are kept.
    G = nx.from_pandas_edgelist(
        X_train,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=['h', dataset.label_col],
        create_using=nx.MultiDiGraph(),
    )

    add_centralities_as_node_features(df=None, G=G, graph_path=None, dataset=dataset, cn_measures=cn_measures)

    # Stack the requested measures, in `cn_measures` order, into one float32
    # vector per node; a measure missing on a node counts as 0.
    for node_id in G.nodes():
        node_attrs = G.nodes[node_id]
        node_attrs["n_feats"] = np.array(
            [node_attrs.get(measure, 0) for measure in cn_measures],
            dtype=np.float32,
        )

    # Get network properties of the training graph.
    measures_path = "datasets/" + name + "/training_graph_measures.json"
    graph_measures = calculate_graph_measures(G, measures_path, verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    # Persist the annotated graph.
    graph_pickle_path = "datasets/" + name + "/training_graph_node_features.pkl"
    with open(graph_pickle_path, "wb") as out_file:
        pickle.dump(G, out_file)

In [None]:
if flow_graph:
    # One directed edge per test row (source IP -> destination IP), carrying
    # the feature vector `h` and the label; parallel edges are kept.
    G_test = nx.from_pandas_edgelist(
        X_test,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        edge_attr=['h', dataset.label_col],
        create_using=nx.MultiDiGraph(),
    )

    add_centralities_as_node_features(df=None, G=G_test, graph_path=None, dataset=dataset, cn_measures=cn_measures)

    # Stack the requested measures, in `cn_measures` order, into one float32
    # vector per node; a measure missing on a node counts as 0.
    for node_id in G_test.nodes():
        node_attrs = G_test.nodes[node_id]
        node_attrs["n_feats"] = np.array(
            [node_attrs.get(measure, 0) for measure in cn_measures],
            dtype=np.float32,
        )

    # Get network properties of the testing graph.
    measures_path = "datasets/" + name + "/testing_graph_measures.json"
    graph_measures = calculate_graph_measures(G_test, measures_path, verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    # Persist the annotated graph.
    graph_pickle_path = "datasets/" + name + "/testing_graph_node_features.pkl"
    with open(graph_pickle_path, "wb") as out_file:
        pickle.dump(G_test, out_file)

In [None]:
# if window_graph and line_graph:

#     create_weightless_window_graph(
#         df=X_test,
#         src_ip_col=dataset.src_ip_col,
#         dst_ip_col=dataset.dst_ip_col,
#         window_size=window_size,
#         line_graph=True,
#         folder_path="datasets/" + name + "/line_graph/testing",
#         file_type="pkl")

In [None]:
# nx.write_gexf(G_test, "datasets/cic_ton_iot/testing_graph.gexf")