In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_construction.session_graph import define_sessions, create_weightless_session_graph
from src.graph.centralities.add_centralities import add_centralities

In [2]:
flow_graph = True
with_centralities = True

cn_measures = ["betweenness", "degree", "pagerank"]
network_features = ['src_betweenness', 'dst_betweenness',
                    'src_degree', 'dst_degree', 'src_pagerank', 'dst_pagerank']

session_graph = False

window_graph = False
window_size= 2000

multi_graph=False
line_graph = False
sort_timestamp = False

In [3]:
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

dataset = datasets[name]

In [4]:
df = pd.read_parquet(dataset.path)

In [5]:
df.head()

Unnamed: 0_level_0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443.0,192.168.10.5,54865.0,6.0,7/7/2017 3:30,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80.0,192.168.10.5,55054.0,6.0,7/7/2017 3:30,109.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80.0,192.168.10.5,55055.0,6.0,7/7/2017 3:30,52.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443.0,192.168.10.16,46236.0,6.0,7/7/2017 3:30,34.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443.0,192.168.10.5,54863.0,6.0,7/7/2017 3:30,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,BENIGN,0


In [6]:
cols_to_norm = set(list(df.columns))  - set(list([dataset.label_col, dataset.class_num_col])) - set(dataset.drop_columns)  - set(dataset.weak_columns)
if with_centralities:
    cols_to_norm |= set(network_features)
cols_to_norm = list(cols_to_norm)
cols_to_norm

['dst_betweenness',
 'Flow Byts/s',
 'Bwd IAT Min',
 'PSH Flag Cnt',
 'Flow IAT Min',
 'Idle Std',
 'Fwd Seg Size Min',
 'src_pagerank',
 'Bwd Pkts/s',
 'Subflow Fwd Byts',
 'ACK Flag Cnt',
 'dst_degree',
 'Bwd IAT Tot',
 'dst_pagerank',
 'Active Min',
 'src_degree',
 'Bwd Pkt Len Min',
 'Pkt Len Min',
 'Bwd IAT Max',
 'FIN Flag Cnt',
 'CWE Flag Count',
 'Fwd Header Len',
 'src_betweenness',
 'Init Bwd Win Byts',
 'SYN Flag Cnt',
 'Bwd Seg Size Avg',
 'Active Max',
 'URG Flag Cnt',
 'Down/Up Ratio',
 'Init Fwd Win Byts',
 'Fwd Pkt Len Min',
 'Fwd Seg Size Avg',
 'Bwd Header Len',
 'Fwd Act Data Pkts',
 'ECE Flag Cnt',
 'Fwd Pkts/s',
 'Idle Min']

In [7]:
df[dataset.label_col].value_counts()

Label
0    2265910
1     548968
Name: count, dtype: int64

In [8]:
if sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

In [9]:
if window_graph and line_graph:
    
    df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=False, inplace=True)
    
    scaler = StandardScaler()

    df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])
    df['h'] = df[ cols_to_norm ].values.tolist()
    
    folder_path="datasets/" + name + "/line_graph_unsorted"
    if sort_timestamp:
        folder_path="datasets/" + name + "/line_graph_sorted"
        
    create_weightless_window_graph(
        df=df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=True,
        folder_path=folder_path,
        # folder_path=None,
        # test_percentage = 20,
        edge_attr= ['h',dataset.label_col, "index"],
        file_type="pkl")    

In [10]:
if session_graph:
    folder_path="datasets/" + name + "/session_graphs"
        
    df2 = define_sessions(df,
                          src_ip_col=dataset.src_ip_col,
                          src_port_col=dataset.src_port_col,
                          dst_ip_col=dataset.dst_ip_col,
                          dst_port_col=dataset.dst_port_col,
                        #   protocol_col=dataset.
    )
    
    create_weightless_session_graph(df,
                                    src_ip_col = dataset.src_ip_col,
                                    dst_ip_col = dataset.dst_ip_col,
                                    multi_graph=multi_graph,
                                    line_graph=line_graph,
                                    folder_path=folder_path)

In [11]:
if not window_graph:
	y = df[dataset.label_col]
 
	if sort_timestamp:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3)
	else:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3, random_state=13, stratify=y)

	del df

	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")

	scaler = StandardScaler()
 
	add_centralities(X_train, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features)
	add_centralities(X_test, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features)
  

	print(f"==>> after add_centralities:")
	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")
 
	X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
	X_train['h'] = X_train[ cols_to_norm ].values.tolist()

	X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
	X_test['h'] = X_test[ cols_to_norm ].values.tolist()

==>> X_train.shape: (1970414, 85)
==>> X_train.shape: (844464, 85)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 18890)
==>> features_dicts: ('degree', 18890)
==>> features_dicts: ('pagerank', 18890)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 17175)
==>> features_dicts: ('degree', 17175)
==>> features_dicts: ('pagerank', 17175)
==>> after add_centralities:
==>> X_train.shape: (1970414, 91)
==>> X_train.shape: (844464, 91)


In [12]:
if flow_graph:
    G = nx.from_pandas_edgelist(X_train, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col], create_using=nx.MultiDiGraph())


    # get netowrk properties
    graph_measures = calculate_graph_measures(G, "datasets/" + name + "/training_graph_with_centralities_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_with_centralities_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open("datasets/" + name + "/training_graph_with_centralities.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.0650337003171444 seconds
==>> graph_measures: {'number_of_nodes': 18890, 'number_of_edges': 1970414, 'max_degree': 688788, 'avg_degree': 208.61979883536262, 'density': 0.0055222563088401354}


In [13]:
if flow_graph:
    G_test = nx.from_pandas_edgelist(X_test, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col],create_using=nx.MultiDiGraph())
    
    graph_measures = calculate_graph_measures(G_test, "datasets/" + name + "/testing_graph_with_centralities_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")
    
    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_with_centralities_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")
    
    with open("datasets/" + name + "/testing_graph_with_centralities.pkl", "wb") as f:
        pickle.dump(G_test, f)

==>> calculated degrees, in 0.05383920110762119 seconds
==>> graph_measures: {'number_of_nodes': 17175, 'number_of_edges': 844464, 'max_degree': 295207, 'avg_degree': 98.3364192139738, 'density': 0.0028629445444850877}


In [14]:
# if window_graph and line_graph:

#     create_weightless_window_graph(
#         df=X_test,
#         src_ip_col=dataset.src_ip_col,
#         dst_ip_col=dataset.dst_ip_col,
#         window_size=window_size,
#         line_graph=True,
#         folder_path="datasets/" + name + "/line_graph/testing",
#         file_type="pkl")

In [15]:
# nx.write_gexf(G_test, "datasets/cic_ton_iot/testing_graph.gexf")