In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import pickle
import json
import shutil

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# datasets is a list of available datasets descriptions containing: path, key columns names, and suitable complex network features
from src.dataset.dataset_info import datasets, cn_measures_type_1, cn_measures_type_2, cn_measures_type_3, cn_measures_type_4, network_features_type_1,network_features_type_2,network_features_type_3,network_features_type_4
from src.graph.graph_measures import calculate_graph_measures
from src.graph.centralities.add_centralities import add_centralities
from src.dataset.add_pca_columns import process_clients_with_grouped_pca, evaluate_pca_results, process_clients_with_pca, process_clients_with_grouped_pca_rmse


with_sort_timestamp = True
undersample_classes = True
folder_path = "temp/"
folder_path_prep = "temp/preprocessed/"
output_folder = 'datasets/gdlc'
# output_folder = 'datasets/dbp'


if not os.path.isdir(folder_path):
    os.mkdir(folder_path)
    os.mkdir(folder_path_prep)
    os.mkdir(output_folder)

# Preparing Datasets

### Reading and Cleaning

In [2]:
dataset1 = datasets["cic_ton_iot_5_percent"]
df1 = pd.read_parquet(dataset1.path)

dataset2 = datasets["cic_ids_2017_5_percent"]
df2 = pd.read_parquet(dataset2.path)

==>> dataset1.name: cic_ton_iot_5_percent


### Attacks Types

In [4]:
classes1 = df1[dataset1.class_col].unique()
print(classes1)

['Benign' 'xss' 'password' 'scanning' 'injection' 'ransomware' 'mitm'
 'backdoor' 'ddos' 'dos']


In [5]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['BENIGN' 'PortScan' 'DoS Hulk' 'DoS slowloris' 'DDoS' 'DoS Slowhttptest'
 'FTP-Patator' 'SSH-Patator' 'DoS GoldenEye' 'Web Attack � Brute Force'
 'Infiltration' 'Bot' 'Web Attack � XSS' 'Web Attack � Sql Injection']


renaming some attacks to fit the naming in df1

In [6]:
df2[dataset2.class_col] = df2[dataset2.class_col].replace({"BENIGN": "Benign",
                                                            "DDoS": "ddos",
                                                            "Web Attack � Brute Force": "bruteforce",
                                                            "Web Attack � XSS": "xss"})

In [7]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['Benign' 'PortScan' 'DoS Hulk' 'DoS slowloris' 'ddos' 'DoS Slowhttptest'
 'FTP-Patator' 'SSH-Patator' 'DoS GoldenEye' 'bruteforce' 'Infiltration'
 'Bot' 'xss' 'Web Attack � Sql Injection']


In [8]:
classes = set(np.concatenate([classes2,classes1]))
print(f"==>> classes: {classes}")

==>> classes: {'scanning', 'xss', 'Infiltration', 'Benign', 'PortScan', 'DoS Hulk', 'Bot', 'dos', 'password', 'DoS slowloris', 'bruteforce', 'FTP-Patator', 'backdoor', 'DoS Slowhttptest', 'mitm', 'DoS GoldenEye', 'injection', 'ransomware', 'SSH-Patator', 'ddos', 'Web Attack � Sql Injection'}


### Sorting (optional)

In [9]:
if with_sort_timestamp:
    df1[dataset1.timestamp_col] = pd.to_datetime(df1[dataset1.timestamp_col].str.strip(), format=dataset1.timestamp_format)
    df1.sort_values(dataset1.timestamp_col, inplace= True)

if with_sort_timestamp:
    df2[dataset2.timestamp_col] = pd.to_datetime(df2[dataset2.timestamp_col].str.strip(), format=dataset2.timestamp_format)
    df2.sort_values(dataset2.timestamp_col, inplace= True)

### Encoding Attacks into integers

In [10]:
label_encoder = LabelEncoder()
label_encoder.fit(list(classes))

df1[dataset1.class_num_col] = label_encoder.transform(df1[dataset1.class_col])
df2[dataset2.class_num_col] = label_encoder.transform(df2[dataset2.class_col])
labels_names = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

print(f"==>> labels_names: {labels_names}")

==>> labels_names: {0: 'Benign', 1: 'Bot', 2: 'DoS GoldenEye', 3: 'DoS Hulk', 4: 'DoS Slowhttptest', 5: 'DoS slowloris', 6: 'FTP-Patator', 7: 'Infiltration', 8: 'PortScan', 9: 'SSH-Patator', 10: 'Web Attack � Sql Injection', 11: 'backdoor', 12: 'bruteforce', 13: 'ddos', 14: 'dos', 15: 'injection', 16: 'mitm', 17: 'password', 18: 'ransomware', 19: 'scanning', 20: 'xss'}


### Undersampling classes (optional)

In [11]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        125686
xss           107651
password       16850
injection      13876
scanning        1768
backdoor        1380
ransomware       260
mitm              30
dos                6
ddos               5
dtype: int64


In [12]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:2]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        if class_label in classes_to_undersample:
            class_df = df1[df1[dataset1.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df1[df1[dataset1.class_col] == class_label])

    df1 = []
    # Optional: shuffle the undersampled DataFrame
    df1 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


==>> class_label: Benign
==>> class_label: xss
==>> class_label: password
==>> class_label: injection
==>> class_label: scanning
==>> class_label: backdoor
==>> class_label: ransomware
==>> class_label: mitm
==>> class_label: dos
==>> class_label: ddos


In [13]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        62843
xss           53826
password      16850
injection     13876
scanning       1768
backdoor       1380
ransomware      260
mitm             30
dos               6
ddos              5
dtype: int64


In [14]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign                        113501
DoS Hulk                       11512
PortScan                        7916
ddos                            6391
DoS GoldenEye                    519
FTP-Patator                      377
SSH-Patator                      289
DoS slowloris                    283
DoS Slowhttptest                 259
Bot                               87
bruteforce                        76
xss                               36
Infiltration                       6
Web Attack � Sql Injection         3
dtype: int64


In [15]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:1]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        if class_label in classes_to_undersample:
            class_df = df2[df2[dataset2.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df2[df2[dataset2.class_col] == class_label])

    df2 = []
    # Optional: shuffle the undersampled DataFrame
    df2 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


In [16]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign                        56750
DoS Hulk                      11512
PortScan                       7916
ddos                           6391
DoS GoldenEye                   519
FTP-Patator                     377
SSH-Patator                     289
DoS slowloris                   283
DoS Slowhttptest                259
Bot                              87
bruteforce                       76
xss                              36
Infiltration                      6
Web Attack � Sql Injection        3
dtype: int64


### saving labels encodings and datasets properties

In [17]:
with open(folder_path + '/labels_names.pkl', 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [18]:
total_count = len(df1)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df1[df1['Label'] == 0])
num_attack = len(df1[df1['Label'] == 1])

properties["num_benign"] = num_benign
properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df1["Attack"].unique()) 


G = nx.from_pandas_edgelist(
    df1,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

properties["number_of_nodes"] = G.number_of_nodes() 
properties["number_of_edges"] = G.number_of_edges()

with open(folder_path + '/df1_properties.txt', 'w') as f:
    json.dump(properties, f)
    
properties


{'name': 'cic_ton_iot_5_percent',
 'length': 150844,
 'num_benign': 62843,
 'percentage_of_benign_records': 41.66092121662115,
 'num_attack': 88001,
 'percentage_of_attack_records': 58.33907878337885,
 'attacks': ['xss',
  'Benign',
  'injection',
  'ransomware',
  'password',
  'backdoor',
  'scanning',
  'mitm',
  'ddos',
  'dos'],
 'number_of_nodes': 12283,
 'number_of_edges': 12895}

In [19]:
total_count = len(df2)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df2[df2['Label'] == 0])
num_attack = len(df2[df2['Label'] == 1])

properties["num_benign"] = num_benign

properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df2["Attack"].unique())  # .to_list()


G = nx.from_pandas_edgelist(
    df2,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

properties["number_of_nodes"] = G.number_of_nodes() 
properties["number_of_edges"] = G.number_of_edges()

with open(folder_path + '/df2_properties.txt', 'w') as f:
    json.dump(properties, f)

properties


{'name': 'cic_ton_iot_5_percent',
 'length': 84504,
 'num_benign': 56750,
 'percentage_of_benign_records': 67.1565843037016,
 'num_attack': 27754,
 'percentage_of_attack_records': 32.8434156962984,
 'attacks': ['Benign',
  'ddos',
  'DoS GoldenEye',
  'PortScan',
  'DoS Hulk',
  'DoS slowloris',
  'FTP-Patator',
  'Bot',
  'DoS Slowhttptest',
  'SSH-Patator',
  'xss',
  'bruteforce',
  'Infiltration',
  'Web Attack � Sql Injection'],
 'number_of_nodes': 7520,
 'number_of_edges': 16778}

# Splitting into Clients

### creating main training and testing splits

test parts will be concatenated to create the main testing df

train parts will be further splitted into clients

In [20]:
train1, test1 = train_test_split(df1, test_size=0.1, shuffle= True, random_state=1, stratify=df1[dataset1.class_col])
train2, test2 = train_test_split(df2, test_size=0.1, shuffle= True, random_state=1, stratify=df2[dataset2.class_col])


if with_sort_timestamp:
    # train1[dataset1.timestamp_col] = pd.to_datetime(train1[dataset1.timestamp_col].str.strip(), format=dataset1.timestamp_format)
    train1.sort_values(dataset1.timestamp_col, inplace= True)
    train2.sort_values(dataset2.timestamp_col, inplace= True)


### Computing graph-level measures (to apply GDLC)

In [21]:
# split dfs into clients
client_data = np.array_split(train1, 5) + np.array_split(train2, 3)

graphs_properties_path = os.path.join(output_folder, 'graphs_properties')

for cid, data_partition in enumerate(client_data):
    data_partition.to_parquet(
        folder_path + "client_{}.parquet".format(cid))

    G = nx.from_pandas_edgelist(
        data_partition, source=dataset1.src_ip_col, target=dataset1.dst_ip_col, create_using=nx.Graph())
    properties = calculate_graph_measures(G, f"client_{cid}", graphs_properties_path)
    print(f"Computed properties for client_{cid}: {properties}")


    
test = pd.concat([test1, test2])
test.to_parquet(folder_path + "test.parquet")

G_test = nx.from_pandas_edgelist(
    test, source=dataset1.src_ip_col, target=dataset1.dst_ip_col, create_using=nx.Graph())
test_properties = calculate_graph_measures(G_test, "test", graphs_properties_path)

print(f"Computed properties for test dataset: {test_properties}")

  return bound(*args, **kwds)


### Adding Centralities

Specifying the centralities to add, according to which branch (type) from the four branches of GDLC

In [22]:
cn_measures_types = [
    cn_measures_type_1,
    cn_measures_type_3,
    cn_measures_type_3,
    cn_measures_type_3,
    cn_measures_type_1,
    cn_measures_type_2,
    cn_measures_type_1,
    cn_measures_type_1,
    cn_measures_type_2,
]

network_features_types = [
    network_features_type_1,
    network_features_type_3,
    network_features_type_3,
    network_features_type_3,
    network_features_type_1,
    network_features_type_2,
    network_features_type_1,
    network_features_type_1,
    network_features_type_2,
]

# homogeneous clients, using same centralities 

# cn_measures_type_0 = ["betweenness", "degree", "pagerank"]
# network_features_type_0 = ['src_betweenness', 'dst_betweenness', 'src_degree', 'dst_degree', 'src_pagerank', 'dst_pagerank']


# cn_measures_types = [
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
# ]

# network_features_types = [
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
# ]

In [23]:
client_filenames = [
    "client_0.parquet",
    "client_1.parquet",
    "client_2.parquet",
    "client_3.parquet",
    "client_4.parquet",
    "client_5.parquet",
    "client_6.parquet",
    "client_7.parquet",
    "test.parquet"
]
clients_paths = [os.path.join(folder_path, name) for name in client_filenames]

In [24]:
centralities_columns = []
def process_dataset(name, path, dataset, cn_measures, network_features):
    print("Processing dataset: {}".format(name))
    new_path = os.path.join(folder_path_prep, "{}.parquet".format(name))
    graph_path = os.path.join(folder_path_prep,"graphs","graph_{}.gexf".format(name))
    os.makedirs(os.path.dirname(new_path), exist_ok=True)
    os.makedirs(os.path.dirname(graph_path), exist_ok=True)
    
    df = pd.read_parquet(path)
    # df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # df.dropna(axis=0, how='any', inplace=True)
    # df.drop_duplicates(subset=list(set(df.columns) - set([dataset.timestamp_col, dataset.flow_id_col])), keep="first", inplace=True)
    
    columns = add_centralities(df, new_path, graph_path, dataset, cn_measures, network_features)
    centralities_columns.append(columns)

In [26]:
process_dataset("client_0", clients_paths[0], datasets["cic_ton_iot_5_percent"], cn_measures_types[0], network_features_types[0])
process_dataset("client_1", clients_paths[1], datasets["cic_ton_iot_5_percent"], cn_measures_types[1], network_features_types[1])
process_dataset("client_2", clients_paths[2], datasets["cic_ton_iot_5_percent"], cn_measures_types[2], network_features_types[2])
process_dataset("client_3", clients_paths[3], datasets["cic_ton_iot_5_percent"], cn_measures_types[3], network_features_types[3])
process_dataset("client_4", clients_paths[4], datasets["cic_ton_iot_5_percent"], cn_measures_types[4], network_features_types[4])
process_dataset("client_5", clients_paths[5], datasets["cic_ids_2017_5_percent"], cn_measures_types[5], network_features_types[5])
process_dataset("client_6", clients_paths[6], datasets["cic_ids_2017_5_percent"], cn_measures_types[6], network_features_types[6])
process_dataset("client_7", clients_paths[7], datasets["cic_ids_2017_5_percent"], cn_measures_types[7], network_features_types[7])
process_dataset("test", clients_paths[8], datasets["cic_ids_2017_5_percent"], cn_measures_types[8], network_features_types[8])

Processing dataset: client_0
calculated betweenness
calculated local_betweenness
calculated degree
calculated local_degree
calculated eigenvector
calculated closeness
calculated pagerank
calculated local_pagerank
calculated k_core
calculated k_truss
calculated Comm
==>> features_dicts: ('betweenness', 4890)
==>> features_dicts: ('local_betweenness', 4890)
==>> features_dicts: ('degree', 4890)
==>> features_dicts: ('local_degree', 4890)
==>> features_dicts: ('eigenvector', 4890)
==>> features_dicts: ('closeness', 4890)
==>> features_dicts: ('pagerank', 4890)
==>> features_dicts: ('local_pagerank', 4890)
==>> features_dicts: ('k_core', 4890)
==>> features_dicts: ('k_truss', 4890)
==>> features_dicts: ('Comm', 4890)
DataFrame written to temp/preprocessed/client_0.parquet
Processing dataset: client_1
calculated betweenness
calculated local_betweenness
calculated pagerank
calculated local_pagerank
calculated k_core
calculated k_truss
calculated Comm
==>> features_dicts: ('betweenness', 44)


### Adding PCA columns

In [27]:
from collections import defaultdict

all_centrality_measures = set([
    'src_degree', 'dst_degree', 'dst_betweenness', 'pagerank', 'dst_local_pagerank',
    'src_k_core', 'dst_mv', 'global_betweenness', 'src_local_pagerank', 'dst_local_degree',
    'dst_global_pagerank', 'betweenness', 'local_pagerank', 'eigenvector', 'src_global_degree',
    'src_global_betweenness', 'src_closeness', 'global_pagerank', 'local_betweenness',
    'dst_eigenvector', 'dst_global_betweenness', 'src_global_pagerank', 'k_truss', 'global_degree',
    'src_local_degree', 'degree', 'dst_closeness', 'dst_k_truss', 'dst_global_degree', 'dst_pagerank',
    'local_degree', 'src_pagerank', 'src_Comm', 'dst_local_betweenness', 'src_k_truss', 'dst_k_core',
    'closeness', 'src_mv', 'Comm', 'src_eigenvector', 'dst_Comm', 'src_betweenness', 'k_core',
    'src_local_betweenness', 'mv'
])

clients_paths = [
    folder_path_prep + "client_0.parquet",
    folder_path_prep + "client_1.parquet",
    folder_path_prep + "client_2.parquet",
    folder_path_prep + "client_3.parquet",
    folder_path_prep + "client_4.parquet",
    folder_path_prep + "client_5.parquet",
    folder_path_prep + "client_6.parquet",
    folder_path_prep + "client_7.parquet",
    folder_path_prep + "test.parquet",
]

client_features = {}

for client_path in clients_paths:
    df = pd.read_parquet(client_path)
    features = set(df.columns)
    centrality_features = features.intersection(all_centrality_measures)
    client_features[client_path] = centrality_features

feature_groups = defaultdict(list)
for client_path, features in client_features.items():
    feature_groups[frozenset(features)].append(client_path)

for i, (unique_feature_set, clients) in enumerate(feature_groups.items(), 1):
    print(f"Unique Centrality Feature Set Group {i}:")
    print(f"Centrality Features: {set(unique_feature_set)}")
    print(f"Clients: {clients}")
    print("----------")

pca_results, pca_columns = process_clients_with_grouped_pca_rmse(
    feature_groups,
    output_folder,
    n_components=7
)



Unique Centrality Feature Set Group 1:
Centrality Features: {'dst_k_truss', 'src_pagerank', 'dst_Comm', 'dst_local_betweenness', 'src_closeness', 'src_betweenness', 'src_k_truss', 'src_local_betweenness', 'src_k_core', 'dst_local_degree', 'dst_closeness', 'src_eigenvector', 'dst_pagerank', 'dst_k_core', 'src_local_degree', 'dst_local_pagerank', 'src_local_pagerank', 'dst_eigenvector', 'src_degree', 'dst_degree', 'dst_betweenness', 'src_Comm'}
Clients: ['temp/preprocessed/client_0.parquet', 'temp/preprocessed/client_4.parquet', 'temp/preprocessed/client_6.parquet', 'temp/preprocessed/client_7.parquet']
----------
Unique Centrality Feature Set Group 2:
Centrality Features: {'dst_k_truss', 'src_pagerank', 'src_Comm', 'dst_k_core', 'dst_Comm', 'dst_local_betweenness', 'src_betweenness', 'src_local_pagerank', 'src_k_truss', 'src_local_betweenness', 'dst_local_pagerank', 'dst_betweenness', 'src_k_core', 'dst_pagerank'}
Clients: ['temp/preprocessed/client_1.parquet', 'temp/preprocessed/client

In [28]:
shutil.copy(os.path.join(folder_path, 'labels_names.pkl'), os.path.join(output_folder, '/labels_names.pkl'))
with open(output_folder + '/added_columns.pkl', 'wb') as f:
    pickle.dump([centralities_columns, pca_columns], f)

In [29]:
# clients_paths = [
#     output_folder + "/client_0.parquet",
#     output_folder + "/client_1.parquet",
#     output_folder + "/client_2.parquet",
#     output_folder + "/client_3.parquet",
#     output_folder + "/client_4.parquet",
#     output_folder + "/client_5.parquet",
#     output_folder + "/client_6.parquet",
#     output_folder + "/client_7.parquet",
#     output_folder + "/test.parquet"

# ]

# evaluate_pca_results(clients_paths)