Importing Libraries

In [None]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import pickle
from src.network.network_features import cal_betweenness_centrality
from sklearn.model_selection import train_test_split
# `datasets` is a list of dataset descriptors; each one holds the dataset path, its key column names, and the complex-network features suited to it
from src.data.dataset_info import datasets



# Notebook-wide switches.
with_sort_timestamp = False   # parse the timestamp columns and sort each dataset by them
undersample_classes = True    # halve the largest classes before splitting
folder_path = "folder path"   # output directory for every artifact written by this notebook

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists (no separate isdir check needed).
os.makedirs(folder_path, exist_ok=True)

## 1. Data loading and cleaning

In [None]:
# First dataset: pick its descriptor, load the parquet file, then clean it.
dataset1 = datasets[0]
print(f"==>> dataset1.name: {dataset1.name}")

df1 = pd.read_parquet("path to the dataset")

# Treat +/-inf as missing, then discard every row with any missing value.
df1 = df1.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how="any")

# Two rows count as duplicates when they agree on every column other than the
# timestamp and the flow id; only the first occurrence is kept.
ignored_cols1 = {dataset1.timestamp_col, dataset1.flow_id_col}
dedup_cols1 = [col for col in set(df1.columns) if col not in ignored_cols1]
df1 = df1.drop_duplicates(subset=dedup_cols1, keep="first")

# Remove the rows whose class is listed in the descriptor's `low_classes`.
if dataset1.low_classes:
    keep_rows1 = ~df1[dataset1.class_col].isin(dataset1.low_classes)
    df1 = df1[keep_rows1]

In [None]:
# Distinct class labels left in dataset 1 after cleaning.
class_column1 = df1[dataset1.class_col]
classes1 = class_column1.unique()
print(classes1)

In [None]:
# Second dataset: same load-and-clean steps as for the first one.
dataset2 = datasets[1]
print(f"==>> dataset2.name: {dataset2.name}")

df2 = pd.read_parquet("path to the dataset")

# Treat +/-inf as missing, then discard every row with any missing value.
df2 = df2.replace([np.inf, -np.inf], np.nan).dropna(axis=0, how="any")

# Two rows count as duplicates when they agree on every column other than the
# timestamp and the flow id; only the first occurrence is kept.
ignored_cols2 = {dataset2.timestamp_col, dataset2.flow_id_col}
dedup_cols2 = [col for col in set(df2.columns) if col not in ignored_cols2]
df2 = df2.drop_duplicates(subset=dedup_cols2, keep="first")

# Remove the rows whose class is listed in the descriptor's `low_classes`.
if dataset2.low_classes:
    keep_rows2 = ~df2[dataset2.class_col].isin(dataset2.low_classes)
    df2 = df2[keep_rows2]

In [None]:
# Distinct class labels in dataset 2 before they are renamed.
class_column2 = df2[dataset2.class_col]
classes2 = class_column2.unique()
print(f"==>> classes2: {classes2}")

In [None]:
# Rename dataset 2's class labels (presumably so they match the spelling used
# in dataset 1 - verify against the printed class lists).
# NOTE(review): the "Web Attack" keys contain a U+FFFD replacement character;
# confirm they match the label strings actually stored in the parquet file,
# otherwise those two classes are silently left unrenamed.
class_renames2 = {
    "BENIGN": "Benign",
    "DDoS": "ddos",
    "Web Attack � Brute Force": "bruteforce",
    "Web Attack � XSS": "xss",
}
df2[dataset2.class_col] = df2[dataset2.class_col].replace(class_renames2)

In [None]:
# Distinct class labels in dataset 2 after the rename.
class_column2 = df2[dataset2.class_col]
classes2 = class_column2.unique()
print(f"==>> classes2: {classes2}")

In [None]:
# Union of the class labels of both datasets (dataset 2's labels first).
all_class_labels = np.concatenate([classes2, classes1])
classes = set(all_class_labels)
print(f"==>> classes: {classes}")

In [None]:
from sklearn.preprocessing import LabelEncoder

# Optional chronological ordering: parse each timestamp column with the format
# given by its dataset descriptor, then sort the frame by it in place.
if with_sort_timestamp:
    df1[dataset1.timestamp_col] = pd.to_datetime(
        df1[dataset1.timestamp_col].str.strip(),
        format=dataset1.timestamp_format,
    )
    df1.sort_values(dataset1.timestamp_col, inplace=True)

    df2[dataset2.timestamp_col] = pd.to_datetime(
        df2[dataset2.timestamp_col].str.strip(),
        format=dataset2.timestamp_format,
    )
    df2.sort_values(dataset2.timestamp_col, inplace=True)

# A single encoder is fitted on the union of both datasets' class names, so a
# given class gets the same integer code in df1 and in df2.
label_encoder = LabelEncoder()
label_encoder.fit(list(classes))

df1[dataset1.class_num_col] = label_encoder.transform(df1[dataset1.class_col])
df2[dataset2.class_num_col] = label_encoder.transform(df2[dataset2.class_col])

# Lookup table: integer code -> class name.
labels_names = {
    code: class_name
    for code, class_name in zip(
        label_encoder.transform(label_encoder.classes_), label_encoder.classes_
    )
}

print(f"==>> labels_names: {labels_names}")

In [None]:
if undersample_classes:
    # Records per class in dataset 1, largest class first (before undersampling).
    class_counts = df1.groupby(dataset1.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [None]:
if undersample_classes:
    # Same seed as the train/test split below, so that Restart & Run All
    # selects exactly the same rows every time.
    undersample_seed = 1

    # Counts are recomputed here so the cell does not depend on which dataset
    # another cell last stored in `class_counts_sorted`.
    class_counts_sorted = (
        df1.groupby(dataset1.class_col).size().sort_values(ascending=False)
    )

    # The two most frequent classes are the ones that get undersampled.
    classes_to_undersample = class_counts_sorted.index[:2]

    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        class_df = df1[df1[dataset1.class_col] == class_label]
        if class_label in classes_to_undersample:
            # Keep a random half of the records of this class.
            class_df = class_df.sample(frac=0.5, random_state=undersample_seed)
        dfs.append(class_df)

    # Recombine, shuffle the rows and renumber the index from 0.
    df1 = (
        pd.concat(dfs)
        .sample(frac=1, random_state=undersample_seed)
        .reset_index(drop=True)
    )


In [None]:
if undersample_classes:
    # Records per class in dataset 1, largest class first (after undersampling).
    class_counts = df1.groupby(dataset1.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [None]:
if undersample_classes:
    # Records per class in dataset 2, largest class first (before undersampling).
    class_counts = df2.groupby(dataset2.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [None]:
if undersample_classes:
    # Same seed as the train/test split below, so that Restart & Run All
    # selects exactly the same rows every time.
    undersample_seed = 1

    # Counts are recomputed from df2 here: `class_counts_sorted` is also used
    # for dataset 1 by other cells, so reusing a stale value would pick the
    # wrong classes.
    class_counts_sorted = (
        df2.groupby(dataset2.class_col).size().sort_values(ascending=False)
    )

    # Only the single most frequent class is undersampled for dataset 2.
    classes_to_undersample = class_counts_sorted.index[:1]

    dfs = []
    for class_label in class_counts_sorted.index:
        class_df = df2[df2[dataset2.class_col] == class_label]
        if class_label in classes_to_undersample:
            # Keep a random half of the records of this class.
            class_df = class_df.sample(frac=0.5, random_state=undersample_seed)
        dfs.append(class_df)

    # Recombine, shuffle the rows and renumber the index from 0.
    df2 = (
        pd.concat(dfs)
        .sample(frac=1, random_state=undersample_seed)
        .reset_index(drop=True)
    )


In [None]:
if undersample_classes:
    # Records per class in dataset 2, largest class first (after undersampling).
    class_counts = df2.groupby(dataset2.class_col).size()
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

In [None]:
# Persist the code -> name lookup together with the set of all class names.
# os.path.join inserts the path separator when `folder_path` has no trailing
# one, so the file always ends up inside the output folder.
with open(os.path.join(folder_path, 'labels_names.pkl'), 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [None]:
# Summary of dataset 1: size, benign/attack balance and the attack types present.
total_count = len(df1)

# 'Label' == 0 rows are counted as benign, 'Label' == 1 rows as attacks.
num_benign = len(df1[df1['Label'] == 0])
num_attack = len(df1[df1['Label'] == 1])

properties = {
    "name": dataset1.name,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": (num_benign * 100) / total_count,
    "num_attack": num_attack,
    "percentage_of_attack_records": (num_attack * 100) / total_count,
    "attacks": list(df1["Attack"].unique()),
}

# Directed graph with one node per IP address and one edge per distinct
# (source IP, destination IP) pair found in the records.
G = nx.from_pandas_edgelist(
    df1,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph(),
)

print(f"==>> number_of_nodes: {G.number_of_nodes()}")
print(f"==>> number_of_edges: {G.number_of_edges()}")

properties


In [None]:
# Summary of dataset 2: size, benign/attack balance and the attack types present.
total_count = len(df2)

# 'Label' == 0 rows are counted as benign, 'Label' == 1 rows as attacks.
num_benign = len(df2[df2['Label'] == 0])
num_attack = len(df2[df2['Label'] == 1])

properties = {
    # This cell describes df2, so the name and the IP columns come from
    # dataset2's descriptor.
    "name": dataset2.name,
    "length": total_count,
    "num_benign": num_benign,
    "percentage_of_benign_records": (num_benign * 100) / total_count,
    "num_attack": num_attack,
    "percentage_of_attack_records": (num_attack * 100) / total_count,
    "attacks": list(df2["Attack"].unique()),
}

# Directed graph with one node per IP address and one edge per distinct
# (source IP, destination IP) pair found in the records.
G = nx.from_pandas_edgelist(
    df2,
    source=dataset2.src_ip_col,
    target=dataset2.dst_ip_col,
    create_using=nx.DiGraph(),
)

print(f"==>> number_of_nodes: {G.number_of_nodes()}")
print(f"==>> number_of_edges: {G.number_of_edges()}")

properties


In [None]:
# Hold out 10% of each dataset as test data. The split is seeded and
# stratified on the class column so class proportions carry over to both parts.
split_options = dict(test_size=0.1, shuffle=True, random_state=1)

train1, test1 = train_test_split(
    df1, stratify=df1[dataset1.class_col], **split_options
)
train2, test2 = train_test_split(
    df2, stratify=df2[dataset2.class_col], **split_options
)

In [None]:
def add_centralities(df):
    """Add per-record centrality features computed on a simple directed graph.

    A `nx.DiGraph` is built from the source/destination IP columns of `df`
    (column names are taken from the notebook-level `dataset1` descriptor), so
    repeated flows between the same pair of hosts collapse into one edge. The
    multigraph counterpart of these features is produced by
    `add_centralities_multidigraph`.

    Args:
        df: DataFrame containing the source and destination IP columns.

    Returns:
        The same DataFrame object, modified in place, with six extra columns:
        `src_degree`, `dst_degree`, `src_betweenness`, `dst_betweenness`,
        `src_pagerank` and `dst_pagerank`. An IP with no score in a measure
        gets -1.
    """
    src_col = dataset1.src_ip_col
    dst_col = dataset1.dst_ip_col

    # DiGraph, not MultiDiGraph: with a MultiDiGraph these columns would be an
    # exact copy of the `*_multidigraph_*` features.
    G = nx.from_pandas_edgelist(
        df,
        source=src_col,
        target=dst_col,
        create_using=nx.DiGraph()
    )

    print(f"===============")
    print(f"==>> number_of_nodes: {G.number_of_nodes()}")
    print(f"==>> number_of_edges: {G.number_of_edges()}")
    print(f"===============")

    # All three measures are computed before any column is written.
    # NOTE(review): cal_betweenness_centrality is a project helper; it is
    # assumed to accept a DiGraph and return a dict keyed by node - confirm.
    centralities = (
        ("degree", nx.degree_centrality(G)),
        ("betweenness", cal_betweenness_centrality(G)),
        ("pagerank", nx.pagerank(G, alpha=0.85)),
    )

    for measure, scores in centralities:
        for side, ip_col in (("src", src_col), ("dst", dst_col)):
            # Element-wise lookup on the IP column; much cheaper than a
            # row-wise DataFrame.apply and well-defined for an empty frame.
            df[f"{side}_{measure}"] = df[ip_col].map(
                lambda ip, scores=scores: scores.get(ip, -1)
            )

    return df

def add_centralities_multidigraph(df):
    """Add per-record centrality features computed on a directed multigraph.

    A `nx.MultiDiGraph` is built from the source/destination IP columns of
    `df` (column names are taken from the notebook-level `dataset1`
    descriptor), so every record contributes its own edge, including repeated
    flows between the same pair of hosts.

    Args:
        df: DataFrame containing the source and destination IP columns.

    Returns:
        The same DataFrame object, modified in place, with six extra columns:
        `src_multidigraph_degree`, `dst_multidigraph_degree`,
        `src_multidigraph_betweenness`, `dst_multidigraph_betweenness`,
        `src_multidigraph_pagerank` and `dst_multidigraph_pagerank`. An IP
        with no score in a measure gets -1.
    """
    src_col = dataset1.src_ip_col
    dst_col = dataset1.dst_ip_col

    G = nx.from_pandas_edgelist(
        df,
        source=src_col,
        target=dst_col,
        create_using=nx.MultiDiGraph()
    )

    print(f"===============")
    print(f"==>> number_of_nodes: {G.number_of_nodes()}")
    print(f"==>> number_of_edges: {G.number_of_edges()}")
    print(f"===============")

    # All three measures are computed before any column is written.
    centralities = (
        ("degree", nx.degree_centrality(G)),
        ("betweenness", cal_betweenness_centrality(G)),
        ("pagerank", nx.pagerank(G, alpha=0.85)),
    )

    for measure, scores in centralities:
        for side, ip_col in (("src", src_col), ("dst", dst_col)):
            # Element-wise lookup on the IP column; much cheaper than a
            # row-wise DataFrame.apply and well-defined for an empty frame.
            df[f"{side}_multidigraph_{measure}"] = df[ip_col].map(
                lambda ip, scores=scores: scores.get(ip, -1)
            )

    return df

In [None]:
# Shared test set: the held-out rows of both datasets, enriched with both
# families of centrality features.
test = pd.concat([test1, test2])
test = add_centralities(test)
test = add_centralities_multidigraph(test)

# os.path.join inserts the path separator when `folder_path` has no trailing
# one, so the file always ends up inside the output folder.
test.to_parquet(os.path.join(folder_path, "test.parquet"))

In [None]:
def _split_rows(frame, n_parts):
    """Split `frame` into `n_parts` consecutive row chunks of near-equal size.

    The row positions are split with NumPy and each chunk is taken with
    `iloc`, which avoids calling `np.array_split` on the DataFrame itself
    (that path goes through the deprecated `DataFrame.swapaxes`). Every chunk
    is an independent copy, so adding columns to it later neither touches
    `frame` nor triggers SettingWithCopyWarning.
    """
    row_chunks = np.array_split(np.arange(len(frame)), n_parts)
    return [frame.iloc[rows].copy() for rows in row_chunks]


# Eight clients in total: five partitions of dataset 1 followed by three of dataset 2.
client_data = _split_rows(train1, 5) + _split_rows(train2, 3)

for cid, data_partition in enumerate(client_data):

    # Centralities are computed per partition, i.e. on the graph formed by
    # that client's records only.
    data_partition = add_centralities(data_partition)
    data_partition = add_centralities_multidigraph(data_partition)

    # os.path.join inserts the path separator when `folder_path` has no
    # trailing one, so the files always end up inside the output folder.
    data_partition.to_parquet(
        os.path.join(folder_path, "client_{}.parquet".format(cid))
    )