In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures

In [2]:
# Key of the dataset to process. It selects the entry in `datasets` and is also
# used to build the "datasets/<name>/..." output paths in the cells below.
# Uncomment exactly one alternative to switch datasets.
name = "cic_ids_2017_5_percent"
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017"
# name = "nf_bot_iot"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"

# Descriptor of the selected dataset; the cells below read its `path`,
# `label_col`, `src_ip_col`, `dst_ip_col`, `drop_columns` and `weak_columns`.
dataset = datasets[name]

In [3]:
# Load the selected dataset's table from the Parquet file at `dataset.path`.
df = pd.read_parquet(dataset.path)

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0_level_0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66292,192.168.10.1-192.168.10.3-53-60671-17,192.168.10.3,60671.0,192.168.10.1,53.0,17.0,03/07/2017 10:23:37,30919.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
420414,192.168.10.8-52.84.64.212-51938-443-6,52.84.64.212,443.0,192.168.10.8,51938.0,6.0,03/07/2017 11:20:18,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
393869,172.217.10.2-192.168.10.8-80-55154-6,192.168.10.8,55154.0,172.217.10.2,80.0,6.0,5/7/2017 1:53,115525809.0,16.0,14.0,...,140027.3781,501173.0,36578.0,9976579.636,58066.24315,9994305.0,9801504.0,0,BENIGN,0
319307,192.168.10.1-192.168.10.3-53-61248-17,192.168.10.3,61248.0,192.168.10.1,53.0,17.0,03/07/2017 04:15:49,60594.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
419002,192.168.10.3-192.168.10.12-53-32968-17,192.168.10.12,32968.0,192.168.10.3,53.0,17.0,03/07/2017 09:40:12,310.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0


In [5]:
# Class balance of the column that the train/test split stratifies on.
# Reads the configured label column instead of a hard-coded `Label` attribute,
# so the cell keeps working (and stays meaningful) when `name` selects a
# dataset whose label column has a different name.
df[dataset.label_col].value_counts()

Label
0    113501
1     27754
Name: count, dtype: int64

In [6]:
# Stratified 70/30 split with a fixed seed. The whole frame (label column
# included) is split, because the label is later attached to the graph edges.
y = df[dataset.label_col]
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.3,
    random_state=13,
    stratify=y,
)

# Release the full frame now that both splits exist; re-running this cell
# therefore requires reloading `df` first.
del df

In [7]:
# (rows, columns) of the training split.
X_train.shape

(98878, 85)

In [8]:
# (rows, columns) of the testing split.
X_test.shape

(42377, 85)

In [9]:
# Feature columns to standardise: every column after the first two, minus the
# label column and the dataset's dropped and weak columns.
#
# The list is built by filtering the columns in their original order rather
# than by a set difference: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would shuffle the layout of the
# `h` feature vectors stored in the pickled graphs from one run to the next.
excluded_cols = {dataset.label_col, *dataset.drop_columns, *dataset.weak_columns}
# dict.fromkeys removes any repeated column name while keeping first-seen order,
# matching the de-duplication the set-based version performed.
cols_to_norm = list(dict.fromkeys(
    col for col in X_train.columns[2:] if col not in excluded_cols
))

In [10]:
# Fit the scaler on the training split only and overwrite the selected feature
# columns with their standardised values.
scaler = StandardScaler()
X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])

# Pack each row's standardised features into a single list stored in column 'h',
# which is later attached to the graph edges.
X_train['h'] = X_train[cols_to_norm].to_numpy().tolist()

In [11]:
# Training graph: one directed edge per row, from source IP to destination IP,
# carrying the feature vector 'h' and the label. A MultiDiGraph is used so that
# several rows between the same pair of endpoints stay separate edges.
G = nx.from_pandas_edgelist(
    X_train,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    edge_attr=['h', dataset.label_col],
    create_using=nx.MultiDiGraph(),
)

In [12]:
# Get network properties of the training multigraph. The second argument is a
# JSON path handed to the helper (presumably where the measures are written —
# TODO confirm against calculate_graph_measures).
graph_measures = calculate_graph_measures(
    G,
    f"datasets/{name}/training_graph_measures.json",
)
graph_measures

{'number_of_nodes': 8839,
 'number_of_edges': 98878,
 'max_degree': 34367,
 'avg_degree': 22.37311913112343,
 'density': 0.0012657342798779944,
 'number_of_communities': 17,
 'mixing_parameter': 0.5978478529096463,
 'modularity': 0.07888858775104314}

In [13]:
# Same measures on the simple directed view of the training graph:
# nx.DiGraph(G) keeps a single edge per ordered (source, target) pair, so the
# parallel edges of the multigraph are collapsed before measuring.
graph_measures = calculate_graph_measures(
    nx.DiGraph(G),
    f"datasets/{name}/training_graph_simple_measures.json",
)
graph_measures

{'number_of_nodes': 8839,
 'number_of_edges': 21329,
 'max_degree': 2386,
 'avg_degree': 4.826111551080439,
 'transitivity': 0.0011424845599242706,
 'density': 0.00027303188227429504,
 'number_of_communities': 15,
 'mixing_parameter': 0.4944910684982887,
 'modularity': 0.4151360637822604}

In [14]:
# Persist the training graph, including its edge attributes, as a pickle file
# in the dataset's folder.
with open(f"datasets/{name}/training_graph.pkl", mode="wb") as graph_file:
    pickle.dump(G, graph_file)

In [15]:
# Optional GEXF export of the training graph; uncomment to enable.
# nx.write_gexf(G, "datasets/" + name + "/training_graph.gexf")

In [16]:
# Apply the scaler fitted on the training split to the test split (no re-fit),
# overwriting the feature columns in place.
# NOTE: this reads and overwrites the same columns, so running the cell a
# second time scales already-scaled values; re-run from the split cell instead.
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

In [17]:
# Pack each test row's scaled features into a single list stored in column 'h',
# using the same column order as the training split.
X_test['h'] = X_test[cols_to_norm].to_numpy().tolist()

In [18]:
# Testing graph, built the same way as the training graph: one directed edge
# per row from source IP to destination IP, carrying 'h' and the label.
G_test = nx.from_pandas_edgelist(
    X_test,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    edge_attr=['h', dataset.label_col],
    create_using=nx.MultiDiGraph(),
)

In [19]:
# Network properties of the testing multigraph. The second argument is a JSON
# path handed to the helper (presumably where the measures are written —
# TODO confirm against calculate_graph_measures).
graph_measures = calculate_graph_measures(
    G_test,
    f"datasets/{name}/testing_graph_measures.json",
)
graph_measures

{'number_of_nodes': 5631,
 'number_of_edges': 42377,
 'max_degree': 14695,
 'avg_degree': 15.051323033209021,
 'density': 0.0013367071965549753,
 'number_of_communities': 18,
 'mixing_parameter': 0.5614366283597234,
 'modularity': 0.247284465004819}

In [20]:
# Same measures on the simple directed view of the testing graph:
# nx.DiGraph(G_test) keeps a single edge per ordered (source, target) pair.
graph_measures = calculate_graph_measures(
    nx.DiGraph(G_test),
    f"datasets/{name}/testing_graph_simple_measures.json",
)
graph_measures

{'number_of_nodes': 5631,
 'number_of_edges': 11336,
 'max_degree': 1228,
 'avg_degree': 4.026283075830226,
 'transitivity': 0.0016106360907701467,
 'density': 0.0003575739854200911,
 'number_of_communities': 14,
 'mixing_parameter': 0.4296047988708539,
 'modularity': 0.47940918920318815}

In [21]:
# Persist the testing graph, including its edge attributes, as a pickle file
# in the dataset's folder.
with open(f"datasets/{name}/testing_graph.pkl", mode="wb") as graph_file:
    pickle.dump(G_test, graph_file)

In [22]:
# Optional GEXF export of the testing graph; uncomment to enable.
# nx.write_gexf(G_test, "datasets/" + name + "/testing_graph.gexf")