In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_construction.session_graph import define_sessions, create_weightless_session_graph
from src.graph.centralities.add_centralities import add_centralities

In [2]:
flow_graph = True

with_centralities = True

cn_measures = ["betweenness", "degree", "pagerank"]
network_features = ['src_betweenness', 'dst_betweenness',
                    'src_degree', 'dst_degree', 'src_pagerank', 'dst_pagerank']

session_graph = False

window_graph = False
window_size= 2000

multi_graph=False
line_graph = False
sort_timestamp = False

In [3]:
# name = "cic_ton_iot_5_percent"
name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

dataset = datasets[name]

In [4]:
df = pd.read_parquet(dataset.path)

In [5]:
df.head()

Unnamed: 0_level_0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,177.30.87.144-192.168.1.1-0-0-0,177.30.87.144,0.0,192.168.1.1,0.0,0.0,25/04/2019 05:18:52 pm,47814343.0,5.0,0.0,...,0.0,1038036.0,1038036.0,518725600000000.0,898459000000000.0,1556177000000000.0,16573240.0,0,Benign,0
1,167.49.176.28-50.165.192.168-0-0-0,167.49.176.28,0.0,50.165.192.168,0.0,0.0,25/04/2019 05:18:49 pm,2033142.0,2.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,Benign,0
2,230.158.52.59-177.21.192.168-0-0-0,230.158.52.59,0.0,177.21.192.168,0.0,0.0,25/04/2019 05:18:37 pm,82877133.0,14.0,0.0,...,1711593.0,3942470.0,226402.0,172908500000000.0,518725600000000.0,1556177000000000.0,6036493.0,0,Benign,0
3,183.68.192.168-1.1.192.168-0-0-0,183.68.192.168,0.0,1.1.192.168,0.0,0.0,25/04/2019 05:18:42 pm,24359.0,2.0,0.0,...,0.0,0.0,0.0,1556177000000000.0,0.0,1556177000000000.0,1556177000000000.0,0,Benign,0
4,183.41.192.168-1.1.192.168-0-0-0,183.41.192.168,0.0,1.1.192.168,0.0,0.0,25/04/2019 05:18:42 pm,10239351.0,3.0,0.0,...,0.0,4053975.0,4053975.0,778088400000000.0,1100383000000000.0,1556177000000000.0,6185376.0,0,Benign,0


In [6]:
cols_to_norm = set(list(df.columns))  - set(list([dataset.label_col, dataset.class_num_col])) - set(dataset.drop_columns)  - set(dataset.weak_columns)
if with_centralities:
    cols_to_norm_with_centralities = list(cols_to_norm | set(network_features))
cols_to_norm = list(cols_to_norm)
cols_to_norm

['ECE Flag Cnt',
 'Down/Up Ratio',
 'Bwd IAT Min',
 'Idle Min',
 'Init Fwd Win Byts',
 'Fwd Header Len',
 'Bwd Pkts/s',
 'Idle Std',
 'Bwd IAT Max',
 'Subflow Fwd Pkts',
 'Fwd IAT Max',
 'Active Min',
 'Fwd Pkts/s',
 'Subflow Bwd Byts',
 'Fwd Pkt Len Min',
 'Pkt Len Min',
 'RST Flag Cnt',
 'Fwd Act Data Pkts',
 'Subflow Fwd Byts',
 'Protocol',
 'Init Bwd Win Byts',
 'Flow IAT Std',
 'Tot Fwd Pkts',
 'ACK Flag Cnt',
 'Bwd Pkts/b Avg',
 'SYN Flag Cnt',
 'Bwd Blk Rate Avg',
 'Fwd IAT Min',
 'Flow Byts/s',
 'Bwd IAT Std',
 'Bwd Byts/b Avg',
 'Active Max',
 'Fwd PSH Flags',
 'Bwd Pkt Len Min',
 'Idle Max',
 'Fwd Seg Size Min',
 'TotLen Fwd Pkts']

In [7]:
df[dataset.label_col].value_counts()

Label
1    2836524
0    2514059
Name: count, dtype: int64

In [8]:
if sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

In [9]:
if window_graph and line_graph:
    
    df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=False, inplace=True)
    
    scaler = StandardScaler()

    df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])
    df['h'] = df[ cols_to_norm ].values.tolist()
    
    folder_path="datasets/" + name + "/line_graph_unsorted"
    if sort_timestamp:
        folder_path="datasets/" + name + "/line_graph_sorted"
        
    create_weightless_window_graph(
        df=df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=True,
        folder_path=folder_path,
        # folder_path=None,
        # test_percentage = 20,
        edge_attr= ['h',dataset.label_col, "index"],
        file_type="pkl")    

In [10]:
if session_graph:
    folder_path="datasets/" + name + "/session_graphs"
        
    df2 = define_sessions(df,
                          src_ip_col=dataset.src_ip_col,
                          src_port_col=dataset.src_port_col,
                          dst_ip_col=dataset.dst_ip_col,
                          dst_port_col=dataset.dst_port_col,
                        #   protocol_col=dataset.
    )
    
    create_weightless_session_graph(df,
                                    src_ip_col = dataset.src_ip_col,
                                    dst_ip_col = dataset.dst_ip_col,
                                    multi_graph=multi_graph,
                                    line_graph=line_graph,
                                    folder_path=folder_path)

In [11]:
if flow_graph:
	y = df[dataset.label_col]
 
	if sort_timestamp:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3)
	else:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3, random_state=13, stratify=y)

	del df

	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")

	scaler = StandardScaler()

 
	if with_centralities:
		add_centralities(X_train, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features, create_using=nx.MultiDiGraph())
		add_centralities(X_test, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features, create_using=nx.MultiDiGraph())
		print(f"==>> after add_centralities:")
		print(f"==>> X_train.shape: {X_train.shape}")
		print(f"==>> X_train.shape: {X_test.shape}")
 
  
	if with_centralities:
		X_train[cols_to_norm_with_centralities] = scaler.fit_transform(X_train[cols_to_norm_with_centralities])
		X_train['h'] = X_train[ cols_to_norm_with_centralities ].values.tolist()
	else:
		X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
		X_train['h'] = X_train[ cols_to_norm ].values.tolist()

	cols_to_drop = list(set(list(X_train.columns)) - set(list([dataset.label_col, dataset.src_ip_col, dataset.dst_ip_col, 'h'])))
	X_train.drop(cols_to_drop, axis=1, inplace=True)

	if with_centralities:
		X_test[cols_to_norm_with_centralities] = scaler.transform(X_test[cols_to_norm_with_centralities])
		X_test['h'] = X_test[ cols_to_norm_with_centralities ].values.tolist()
	else:
		X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
		X_test['h'] = X_test[ cols_to_norm ].values.tolist()

	X_test.drop(cols_to_drop, axis=1, inplace=True)

==>> X_train.shape: (3745408, 85)
==>> X_train.shape: (1605175, 85)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 125476)
==>> features_dicts: ('degree', 125476)
==>> features_dicts: ('pagerank', 125476)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 85708)
==>> features_dicts: ('degree', 85708)
==>> features_dicts: ('pagerank', 85708)
==>> after add_centralities:
==>> X_train.shape: (3745408, 91)
==>> X_train.shape: (1605175, 91)


In [12]:
if flow_graph:
    X_train.columns

In [13]:
# cols_to_drop = list(set(list(X_train.columns)) - set(list([dataset.label_col, dataset.src_ip_col, dataset.dst_ip_col, 'h'])))
# cols_to_drop

In [14]:
if flow_graph:
    graph_name = "training_graph"
    if with_centralities:
        graph_name = "training_graph_with_centralities"

    G = nx.from_pandas_edgelist(X_train, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col], create_using=nx.MultiDiGraph())
    # get netowrk properties
    graph_measures = calculate_graph_measures(G, f"datasets/{name}/{graph_name}_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open(f"datasets/{name}/{graph_name}.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.491796700283885 seconds
==>> graph_measures: {'number_of_nodes': 125476, 'number_of_edges': 3745408, 'max_degree': 1446507, 'avg_degree': 59.69919347126144, 'density': 0.00023789278131604479}


In [15]:
if flow_graph:
    graph_name = "testing_graph"
    if with_centralities:
        graph_name = "testing_graph_with_centralities"

    G_test = nx.from_pandas_edgelist(X_test, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col],create_using=nx.MultiDiGraph())
    
    graph_measures = calculate_graph_measures(G_test, f"datasets/{name}/{graph_name}_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")
    
    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")
    
    with open(f"datasets/{name}/{graph_name}.pkl", "wb") as f:
        pickle.dump(G_test, f)

==>> calculated degrees, in 0.2864442002028227 seconds
==>> graph_measures: {'number_of_nodes': 85708, 'number_of_edges': 1605175, 'max_degree': 620281, 'avg_degree': 37.45683016754562, 'density': 0.00021851674990109105}


In [16]:
# if window_graph and line_graph:

#     create_weightless_window_graph(
#         df=X_test,
#         src_ip_col=dataset.src_ip_col,
#         dst_ip_col=dataset.dst_ip_col,
#         window_size=window_size,
#         line_graph=True,
#         folder_path="datasets/" + name + "/line_graph/testing",
#         file_type="pkl")

In [17]:
# nx.write_gexf(G_test, "datasets/cic_ton_iot/testing_graph.gexf")