In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures

In [2]:
# Key into the project's `datasets` registry that selects which dataset this
# notebook processes. Exactly one assignment is active; the commented-out
# lines are alternative keys (presumably also present in `datasets`).
# `name` is reused further down to build the output paths "datasets/<name>/...".
# name = "cic_ids_2017_5_percent"
# name = "cic_ton_iot_5_percent"
name = "cic_ton_iot"
# name = "cic_ids_2017"
# name = "nf_bot_iot"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"

# Dataset descriptor. Later cells read its `path`, `label_col`, `src_ip_col`,
# `dst_ip_col`, `drop_columns` and `weak_columns` attributes.
dataset = datasets[name]

In [3]:
# Read the full flow table of the selected dataset from its Parquet file.
df = pd.read_parquet(path=dataset.path)

In [4]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0_level_0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,177.30.87.144-192.168.1.1-0-0-0,177.30.87.144,0.0,192.168.1.1,0.0,0.0,25/04/2019 05:18:52 pm,47814343.0,5.0,0.0,...,0.0,1038036.0,1038036.0,518725600000000.0,898459000000000.0,1556177000000000.0,16573240.0,0,Benign,0
1,167.49.176.28-50.165.192.168-0-0-0,167.49.176.28,0.0,50.165.192.168,0.0,0.0,25/04/2019 05:18:49 pm,2033142.0,2.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,Benign,0
2,230.158.52.59-177.21.192.168-0-0-0,230.158.52.59,0.0,177.21.192.168,0.0,0.0,25/04/2019 05:18:37 pm,82877133.0,14.0,0.0,...,1711593.0,3942470.0,226402.0,172908500000000.0,518725600000000.0,1556177000000000.0,6036493.0,0,Benign,0
3,183.68.192.168-1.1.192.168-0-0-0,183.68.192.168,0.0,1.1.192.168,0.0,0.0,25/04/2019 05:18:42 pm,24359.0,2.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,Benign,0
4,183.41.192.168-1.1.192.168-0-0-0,183.41.192.168,0.0,1.1.192.168,0.0,0.0,25/04/2019 05:18:42 pm,10239351.0,3.0,0.0,...,0.0,4053975.0,4053975.0,778088400000000.0,1100383000000000.0,1556177000000000.0,6185376.0,0,Benign,0


In [5]:
# Class balance of the target column. The column is looked up through
# `dataset.label_col` instead of the hard-coded attribute `df.Label`, so this
# cell keeps working for every dataset key selectable in the config cell and
# always reports on the same column the train/test split is stratified on.
df[dataset.label_col].value_counts()

Label
1    2836524
0    2514059
Name: count, dtype: int64

In [6]:
# Target series; it also drives stratification so that both partitions keep
# the class proportions of the full data.
y = df[dataset.label_col]

# 70/30 split with a fixed seed. The whole frame (label column included) is
# split, so the label stays available as an edge attribute when graphs are built.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    stratify=y,
    test_size=0.3,
    random_state=13,
)

# Drop the reference to the unsplit frame so its memory can be reclaimed.
del df

In [7]:
# (rows, columns) of the training partition.
X_train.shape

(3745408, 85)

In [8]:
# (rows, columns) of the test partition.
X_test.shape

(1605175, 85)

In [9]:
# Columns that must not be scaled or packed into the feature vector: the
# label itself plus everything the dataset descriptor marks as dropped or weak.
excluded_cols = {dataset.label_col, *dataset.drop_columns, *dataset.weak_columns}

# Feature columns, skipping the first two columns by position (as before).
# The list is built by filtering in DataFrame column order rather than via
# set arithmetic: iteration order of a set of strings changes from one Python
# process to the next (hash randomisation), which made the position of each
# feature inside the 'h' vectors - and therefore inside the pickled graphs -
# differ between kernel restarts. dict.fromkeys de-duplicates while keeping
# first-seen order, matching the de-duplication the old set() performed.
cols_to_norm = [
    col
    for col in dict.fromkeys(X_train.columns[2:])
    if col not in excluded_cols
]

In [10]:
# Fit the scaler on the training partition only, then standardise the
# selected feature columns of that partition.
scaler = StandardScaler().fit(X_train[cols_to_norm])
X_train[cols_to_norm] = scaler.transform(X_train[cols_to_norm])

# Pack each row's standardised features into a single Python list stored in
# column 'h'; it is attached to the edges when the graph is built.
X_train['h'] = X_train[cols_to_norm].to_numpy().tolist()

In [11]:
# Build the training graph: one directed edge per row from source IP to
# destination IP (MultiDiGraph, so repeated IP pairs stay as separate edges),
# each carrying the feature vector 'h' and the label as edge attributes.
G = nx.from_pandas_edgelist(
    X_train,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    edge_attr=['h', dataset.label_col],
    create_using=nx.MultiDiGraph(),
)

In [12]:
# Get the network properties of the training multigraph. The JSON path is
# passed to the helper (presumably where it stores the measures - confirm in
# src.graph.graph_measures).
graph_measures = calculate_graph_measures(
    G, f"datasets/{name}/training_graph_measures.json"
)
graph_measures

{'number_of_nodes': 125476,
 'number_of_edges': 3745408,
 'max_degree': 1446507,
 'avg_degree': 59.69919347126144,
 'density': 0.00023789278131604479,
 'number_of_communities': 34,
 'mixing_parameter': 4.004904138614538e-05,
 'modularity': 0.17960136372494187}

In [21]:
# Same measures, computed on a plain DiGraph built from the training
# multigraph; reported under a separate "simple" file name.
graph_measures = calculate_graph_measures(
    nx.DiGraph(G), f"datasets/{name}/training_graph_simple_measures.json"
)
graph_measures

In [13]:
# Serialise the training multigraph to disk with pickle.
with open(f"datasets/{name}/training_graph.pkl", "wb") as graph_file:
    pickle.dump(G, graph_file)

In [14]:
# Optional GEXF export (disabled): nx.write_gexf(G, "datasets/" + name + "/training_graph.gexf")

In [15]:
# Standardise the test partition with the scaler that was fitted on the
# training data (transform only - nothing is re-fitted on the test rows).
X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])

In [16]:
# Pack each test row's standardised features into a single Python list
# stored in column 'h', mirroring the training partition.
X_test['h'] = X_test[cols_to_norm].to_numpy().tolist()

In [17]:
# Build the test graph the same way as the training graph: one directed edge
# per row (MultiDiGraph), with 'h' and the label attached as edge attributes.
G_test = nx.from_pandas_edgelist(
    X_test,
    source=dataset.src_ip_col,
    target=dataset.dst_ip_col,
    edge_attr=['h', dataset.label_col],
    create_using=nx.MultiDiGraph(),
)

In [18]:
# Network properties of the test multigraph; the JSON path is passed to the
# helper (presumably its output location - confirm in src.graph.graph_measures).
graph_measures = calculate_graph_measures(
    G_test, f"datasets/{name}/testing_graph_measures.json"
)
graph_measures

{'number_of_nodes': 85708,
 'number_of_edges': 1605175,
 'max_degree': 620281,
 'avg_degree': 37.45683016754562,
 'density': 0.00021851674990109105,
 'number_of_communities': 59,
 'mixing_parameter': 0.002676966685875372,
 'modularity': 0.19965916586252638}

In [None]:
# Same measures, computed on a plain DiGraph built from the test multigraph;
# reported under a separate "simple" file name.
graph_measures = calculate_graph_measures(
    nx.DiGraph(G_test), f"datasets/{name}/testing_graph_simple_measures.json"
)
graph_measures

In [19]:
# Serialise the test multigraph to disk with pickle.
with open(f"datasets/{name}/testing_graph.pkl", "wb") as graph_file:
    pickle.dump(G_test, graph_file)

In [20]:
# Optional GEXF export (disabled): nx.write_gexf(G_test, "datasets/" + name + "/testing_graph.gexf")