In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_construction.session_graph import define_sessions, create_weightless_session_graph
from src.graph.centralities.add_centralities import add_centralities

In [2]:
flow_graph = True
with_centralities = True

cn_measures = ["betweenness", "degree", "pagerank"]
network_features = ['src_betweenness', 'dst_betweenness',
                    'src_degree', 'dst_degree', 'src_pagerank', 'dst_pagerank']

session_graph = False

window_graph = False
window_size= 2000

multi_graph=False
line_graph = False
sort_timestamp = False

In [3]:
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

dataset = datasets[name]

In [4]:
df = pd.read_parquet(dataset.path)

In [5]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,id,expiration_id,src_ip,src_ip_is_private,src_port,dst_ip,dst_ip_is_private,dst_port,protocol,...,dst2src_fin_packets,splt_direction,splt_ps,splt_piat_ms,application_name,application_category_name,application_is_guessed,atk_type,traffic_type,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,192.168.0.11,1,5555,192.168.0.26,1,54128,6,...,0,363,3926,18618,45,14,0,mitm,1,2
2,2,2,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,3,4310,14001,24,9,1,mitm,1,2
6,6,6,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2
8,8,8,1,192.168.0.26,1,54128,192.168.0.11,1,5555,6,...,0,414,6455,187,45,14,0,mitm,1,2
10,10,10,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2


In [6]:
cols_to_norm = set(list(df.columns))  - set(list([dataset.label_col, dataset.class_num_col])) - set(dataset.drop_columns)  - set(dataset.weak_columns)
if with_centralities:
    cols_to_norm |= set(network_features)
cols_to_norm = list(cols_to_norm)
cols_to_norm

['src2dst_stddev_piat_ms',
 'dst_betweenness',
 'src2dst_last_seen_ms',
 'dst2src_max_piat_ms',
 'dst_degree',
 'dst2src_min_ps',
 'src2dst_max_ps',
 'src2dst_mean_piat_ms',
 'src_pagerank',
 'dst2src_ack_packets',
 'src2dst_psh_packets',
 'src2dst_max_piat_ms',
 'protocol',
 'src_degree',
 'dst2src_psh_packets',
 'dst2src_rst_packets',
 'src_betweenness',
 'bidirectional_max_ps',
 'src2dst_stddev_ps',
 'dst_pagerank',
 'src2dst_ack_packets',
 'src2dst_fin_packets',
 'bidirectional_min_ps',
 'src2dst_rst_packets',
 'dst2src_fin_packets',
 'src2dst_min_ps',
 'application_is_guessed',
 'dst2src_mean_piat_ms',
 'dst2src_syn_packets',
 'application_category_name']

In [7]:
df[dataset.label_col].value_counts()

traffic_type
0    33413
1    30786
Name: count, dtype: int64

In [8]:
if sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

In [9]:
if window_graph and line_graph:
    
    df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=False, inplace=True)
    
    scaler = StandardScaler()

    df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])
    df['h'] = df[ cols_to_norm ].values.tolist()
    
    folder_path="datasets/" + name + "/line_graph_unsorted"
    if sort_timestamp:
        folder_path="datasets/" + name + "/line_graph_sorted"
        
    create_weightless_window_graph(
        df=df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=True,
        folder_path=folder_path,
        # folder_path=None,
        # test_percentage = 20,
        edge_attr= ['h',dataset.label_col, "index"],
        file_type="pkl")    

In [10]:
if session_graph:
    folder_path="datasets/" + name + "/session_graphs"
        
    df2 = define_sessions(df,
                          src_ip_col=dataset.src_ip_col,
                          src_port_col=dataset.src_port_col,
                          dst_ip_col=dataset.dst_ip_col,
                          dst_port_col=dataset.dst_port_col,
                        #   protocol_col=dataset.
    )
    
    create_weightless_session_graph(df,
                                    src_ip_col = dataset.src_ip_col,
                                    dst_ip_col = dataset.dst_ip_col,
                                    multi_graph=multi_graph,
                                    line_graph=line_graph,
                                    folder_path=folder_path)

In [11]:
if not window_graph:
	y = df[dataset.label_col]
 
	if sort_timestamp:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3)
	else:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3, random_state=13, stratify=y)

	del df

	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")

	scaler = StandardScaler()
 
	add_centralities(X_train, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features)
	add_centralities(X_test, new_path=None, graph_path=None, dataset=dataset, cn_measures=cn_measures, network_features=network_features)
  

	print(f"==>> after add_centralities:")
	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")
 
	X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
	X_train['h'] = X_train[ cols_to_norm ].values.tolist()

	X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
	X_test['h'] = X_test[ cols_to_norm ].values.tolist()

==>> X_train.shape: (44939, 84)
==>> X_train.shape: (19260, 84)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 100)
==>> features_dicts: ('degree', 100)
==>> features_dicts: ('pagerank', 100)
calculated betweenness
calculated degree
calculated pagerank
==>> features_dicts: ('betweenness', 85)
==>> features_dicts: ('degree', 85)
==>> features_dicts: ('pagerank', 85)
==>> after add_centralities:
==>> X_train.shape: (44939, 90)
==>> X_train.shape: (19260, 90)


In [12]:
if flow_graph:
    G = nx.from_pandas_edgelist(X_train, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col], create_using=nx.MultiDiGraph())


    # get netowrk properties
    graph_measures = calculate_graph_measures(G, "datasets/" + name + "/training_graph_with_centralities_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_with_centralities_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open("datasets/" + name + "/training_graph_with_centralities.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.0001411009579896927 seconds
==>> graph_measures: {'number_of_nodes': 100, 'number_of_edges': 44939, 'max_degree': 17806, 'avg_degree': 898.78, 'density': 4.539292929292929}


In [13]:
if flow_graph:
    G_test = nx.from_pandas_edgelist(X_test, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col],create_using=nx.MultiDiGraph())
    
    graph_measures = calculate_graph_measures(G_test, "datasets/" + name + "/testing_graph_with_centralities_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")
    
    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_with_centralities_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")
    
    with open("datasets/" + name + "/testing_graph_with_centralities.pkl", "wb") as f:
        pickle.dump(G_test, f)

==>> calculated degrees, in 0.00012840144336223602 seconds
==>> graph_measures: {'number_of_nodes': 85, 'number_of_edges': 19260, 'max_degree': 7605, 'avg_degree': 453.1764705882353, 'density': 2.697478991596639}


In [14]:
# if window_graph and line_graph:

#     create_weightless_window_graph(
#         df=X_test,
#         src_ip_col=dataset.src_ip_col,
#         dst_ip_col=dataset.dst_ip_col,
#         window_size=window_size,
#         line_graph=True,
#         folder_path="datasets/" + name + "/line_graph/testing",
#         file_type="pkl")

In [15]:
# nx.write_gexf(G_test, "datasets/cic_ton_iot/testing_graph.gexf")