In [1]:
%load_ext autoreload

import os
import random
import socket
import struct

import networkx as nx
import pandas as pd
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets, cn_measures, network_features
from src.graph.graph_measures import calculate_graph_measures, find_communities
from src.graph.centralities import add_centralities
from local_variables import local_datasets_path


In [2]:
# --- Target -----------------------------------------------------------------
# True: use the dataset's numeric class column as the target;
# False: use its label column instead.
multi_class: bool = True

# --- Ordering ---------------------------------------------------------------
# sort_timestamp: sort the whole table by timestamp before it is split.
# sort_after_partition: sort each partition by timestamp after the split.
sort_timestamp: bool = False
sort_after_partition: bool = True

# --- Node identity ----------------------------------------------------------
# use_port_in_address: append ":<port>" to the source/destination addresses.
# generated_ips: replace the source addresses with randomly generated ones.
use_port_in_address: bool = False
generated_ips: bool = False

# --- Split sizes ------------------------------------------------------------
# test_size is taken from the full table; validation_size is taken from what
# remains after the test partition has been removed.
validation_size: float = 0.1
test_size: float = 0.1

In [3]:
# Key of the dataset to process; it is used both to index `datasets` and as
# the name of the dataset's folder and parquet file.
# Exactly one `name = ...` line should be active; the others are alternatives.
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

# Dataset descriptor; later cells read its column names, timestamp format and
# centralities_set attribute.
dataset = datasets[name]

# Folder of the selected dataset under the machine-specific datasets root.
dataset_folder = os.path.join(local_datasets_path, name)
dataset_folder

'C:\\Users\\Administrateur\\Desktop\\datasets\\cic_ton_iot'

In [4]:
# Assemble the experiment identifier from the active configuration flags.
# Every tag carries its own "__" separator, so the pieces are simply glued.
exp_type_parts = ["gdlc"]

if multi_class:
    exp_type_parts.append("__multi_class")
if use_port_in_address:
    exp_type_parts.append("__ports")
if generated_ips:
    exp_type_parts.append("__generated_ips")

# Exactly one ordering tag is always appended; a global sort takes precedence
# over per-partition sorting.
if sort_timestamp:
    exp_type_parts.append("__sorted")
elif sort_after_partition:
    exp_type_parts.append("__semisorted")
else:
    exp_type_parts.append("__unsorted")

exp_type = "".join(exp_type_parts)

# Output folder for this experiment variant, nested under the dataset folder.
new_folder_path = os.path.join(dataset_folder, exp_type)
new_folder_path

'C:\\Users\\Administrateur\\Desktop\\datasets\\cic_ton_iot\\gdlc__multi_class__semisorted'

In [5]:
# Load the complete table of the selected dataset.
df = pd.read_parquet(os.path.join(dataset_folder, f"{name}.parquet"))

if generated_ips:
    # Replace every source address with a random IPv4 address whose integer
    # value lies in 0xac100001..0xac1f0001 (172.16.0.1 .. 172.31.0.1).
    # A dedicated generator with a fixed seed (the same value the stratified
    # split uses as random_state) makes the generated addresses reproducible
    # across runs instead of depending on the global `random` state.
    ip_rng = random.Random(13)
    df[dataset.src_ip_col] = [
        socket.inet_ntoa(struct.pack('>I', ip_rng.randint(0xac100001, 0xac1f0001)))
        for _ in range(len(df))
    ]

if sort_timestamp:
    # Parse the textual timestamps (surrounding whitespace removed) and order
    # the whole table chronologically.
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

if use_port_in_address:
    # Turn each endpoint into "<ip>:<port>". The port column itself is
    # overwritten with its string form, exactly as before.
    endpoint_columns = (
        (dataset.src_ip_col, dataset.src_port_col),
        (dataset.dst_ip_col, dataset.dst_port_col),
    )
    for ip_col, port_col in endpoint_columns:
        df[port_col] = df[port_col].astype(float).astype(int).astype(str)  # to remove the decimal point
        df[ip_col] = df[ip_col] + ':' + df[port_col]

In [6]:
# G = nx.from_pandas_edgelist(
#     df, source=dataset.src_ip_col, target=dataset.dst_ip_col, create_using=nx.DiGraph)
# G.remove_nodes_from(list(nx.isolates(G)))
# for node in G.nodes():
#     G.nodes[node]['label'] = node
# G1, part, communities = find_communities(G, verbose=True)
# calculate_graph_measures(G, communities=communities, verbose=True)

In [7]:
# Target column: the numeric class for multi-class runs, the label otherwise.
if multi_class:
    y = df[dataset.class_num_col]
else:
    y = df[dataset.label_col]

if sort_timestamp:
    # df is in chronological order here. train_test_split shuffles by default,
    # which would throw that order away (and do so non-reproducibly, since no
    # random_state is given). shuffle=False makes the split chronological:
    # earliest rows -> training, following rows -> validation, latest -> test.
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size, shuffle=False)

    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size, shuffle=False)
else:
    # Random, reproducible split that preserves the class proportions.
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size, random_state=13, stratify=y)

    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size, random_state=13, stratify=y_tr)

# Release the full table; only the partitions are used from here on.
del df

if sort_after_partition:
    for partition in (X_train, X_val, X_test):
        if not sort_timestamp:
            # When sort_timestamp is set the column was already converted to
            # datetime while loading; calling .str on it again would raise
            # AttributeError, so parsing is only done for raw text timestamps.
            partition[dataset.timestamp_col] = pd.to_datetime(
                partition[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
        partition.sort_values(dataset.timestamp_col, inplace=True)

In [8]:
# os.makedirs(new_folder_path, exist_ok=True)
# X_train.to_parquet(os.path.join(new_folder_path, "training.parquet"))
# X_val.to_parquet(os.path.join(new_folder_path, "validation.parquet"))
# X_test.to_parquet(os.path.join(new_folder_path, "testing.parquet"))

In [9]:
def _build_graph_with_communities(flows):
    """Build the directed address graph of one partition and analyse it.

    The graph has one edge per distinct (source address, destination address)
    pair found in ``flows``. Isolated nodes are dropped, every node gets a
    ``'label'`` attribute equal to its own identifier, and the graph is passed
    to ``find_communities`` and then ``calculate_graph_measures`` (both run
    with ``verbose=True``).

    Returns:
        ``(graph, g1, part, communities)`` where the last three values are
        the three results of ``find_communities``, in the order it returns
        them.
    """
    graph = nx.from_pandas_edgelist(
        flows,
        source=dataset.src_ip_col,
        target=dataset.dst_ip_col,
        create_using=nx.DiGraph,
    )
    isolated_nodes = list(nx.isolates(graph))
    graph.remove_nodes_from(isolated_nodes)
    for node_id in graph.nodes():
        graph.nodes[node_id]['label'] = node_id

    g1, part, communities = find_communities(graph, verbose=True)
    calculate_graph_measures(graph, communities=communities, verbose=True)
    return graph, g1, part, communities


# Same processing, in the same order as before: training, validation, testing.
G_train, G1_train, part_train, communities_train = _build_graph_with_communities(X_train)

G_val, G1_val, part_val, communities_val = _build_graph_with_communities(X_val)

G_test, G1_test, part_test, communities_test = _build_graph_with_communities(X_test)

In [None]:
# Choose which set of centrality measures / network features to use.
# The value is 1-based: the export cell indexes `cn_measures` and
# `network_features` with `dataset.centralities_set - 1`.
dataset.centralities_set = 2
dataset.centralities_set

In [None]:
# Make sure the experiment folder exists before anything is written into it.
os.makedirs(new_folder_path, exist_ok=True)

# One row per partition:
# (heading, output file, flows, graph, communities, G1, part)
export_jobs = (
    ("training:", "training.parquet", X_train, G_train, communities_train, G1_train, part_train),
    ("validation:", "validation.parquet", X_val, G_val, communities_val, G1_val, part_val),
    ("testing:", "testing.parquet", X_test, G_test, communities_test, G1_test, part_test),
)

for heading, file_name, split_df, split_graph, split_communities, split_g1, split_part in export_jobs:
    print("===================")
    print(heading)
    # The measure / feature lists are looked up on every iteration, exactly
    # as the three separate calls did.
    add_centralities(
        df=split_df,
        new_path=os.path.join(new_folder_path, file_name),
        graph_path=None,
        dataset=dataset,
        cn_measures=cn_measures[dataset.centralities_set - 1],
        network_features=network_features[dataset.centralities_set - 1],
        G=split_graph,
        communities=split_communities,
        G1=split_g1,
        part=split_part,
    )