In [39]:
import os
import pickle
import random
import socket
import struct

import networkx as nx
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from src.dataset.dataset_info import datasets
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_measures import calculate_graph_measures


In [40]:
multi_class = True

use_port_in_address = True

generated_ips = True

graph_type = "flow"
# graph_type = "window"
# graph_type = "line"

window_size= 10000

sort_timestamp = False

# k_fold = None
# k_fold = 5

validation_size = 0.1
test_size = 0.1

In [41]:
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

dataset = datasets[name]

In [42]:
g_type = ""
if graph_type == "flow":
    g_type = "flow"
elif graph_type == "line":
    g_type = f"line_graph_{window_size}"
elif graph_type == "window":
    g_type = f"window_graph_{window_size}"
    
if multi_class:
    g_type += "__multi_class"
    
# if k_fold:
#     g_type += f"__{k_fold}_fold"
    
if use_port_in_address:
    g_type += "__ports"
    
if generated_ips:
    g_type += "__generated_ips"
    
if sort_timestamp:
    g_type += "__sorted"
else:
    g_type += "__unsorted"
    
folder_path = os.path.join("datasets",name, g_type)
# folder_path = f"datasets/{name}/{g_type}"
folder_path

'datasets\\ccd_inid_modified\\flow__multi_class__ports__generated_ips__unsorted'

In [43]:
df = pd.read_parquet(dataset.path)

In [44]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,id,expiration_id,src_ip,src_ip_is_private,src_port,dst_ip,dst_ip_is_private,dst_port,protocol,...,dst2src_fin_packets,splt_direction,splt_ps,splt_piat_ms,application_name,application_category_name,application_is_guessed,atk_type,traffic_type,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,192.168.0.11,1,5555,192.168.0.26,1,54128,6,...,0,363,3926,18618,45,14,0,mitm,1,2
2,2,2,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,3,4310,14001,24,9,1,mitm,1,2
6,6,6,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2
8,8,8,1,192.168.0.26,1,54128,192.168.0.11,1,5555,6,...,0,414,6455,187,45,14,0,mitm,1,2
10,10,10,0,192.168.0.11,1,5353,224.0.0.251,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2


In [45]:
cols_to_norm = list(set(list(df.columns))  - set(list([dataset.label_col, dataset.class_num_col])) - set(dataset.drop_columns)  - set(dataset.weak_columns))
cols_to_norm

['src2dst_last_seen_ms',
 'dst2src_syn_packets',
 'application_category_name',
 'dst2src_max_piat_ms',
 'dst2src_rst_packets',
 'src2dst_max_piat_ms',
 'dst2src_psh_packets',
 'src2dst_fin_packets',
 'src2dst_ack_packets',
 'src2dst_stddev_piat_ms',
 'src2dst_stddev_ps',
 'src2dst_max_ps',
 'src2dst_psh_packets',
 'bidirectional_min_ps',
 'dst2src_min_ps',
 'dst2src_fin_packets',
 'src2dst_min_ps',
 'src2dst_mean_piat_ms',
 'src2dst_rst_packets',
 'dst2src_ack_packets',
 'application_is_guessed',
 'protocol',
 'dst2src_mean_piat_ms',
 'bidirectional_max_ps']

In [46]:
df[dataset.label_col].value_counts()

traffic_type
0    33413
1    30786
Name: count, dtype: int64

In [47]:
if generated_ips:
    df[dataset.src_ip_col] = df[dataset.src_ip_col].apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))

In [48]:
if sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

In [49]:
if use_port_in_address:
    df[dataset.src_port_col] = df[dataset.src_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
    df[dataset.src_ip_col] = df[dataset.src_ip_col] + ':' + df[dataset.src_port_col]

    df[dataset.dst_port_col] = df[dataset.dst_port_col].astype(float).astype(int).astype(str) # to remove the decimal point
    df[dataset.dst_ip_col] = df[dataset.dst_ip_col] + ':' + df[dataset.dst_port_col]

In [50]:
df.head()

Unnamed: 0_level_0,Unnamed: 0,id,expiration_id,src_ip,src_ip_is_private,src_port,dst_ip,dst_ip_is_private,dst_port,protocol,...,dst2src_fin_packets,splt_direction,splt_ps,splt_piat_ms,application_name,application_category_name,application_is_guessed,atk_type,traffic_type,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1,172.17.227.16:5555,1,5555,192.168.0.26:54128,1,54128,6,...,0,363,3926,18618,45,14,0,mitm,1,2
2,2,2,0,172.30.196.182:5353,1,5353,224.0.0.251:5353,0,5353,17,...,0,3,4310,14001,24,9,1,mitm,1,2
6,6,6,0,172.23.164.28:5353,1,5353,224.0.0.251:5353,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2
8,8,8,1,172.17.104.186:54128,1,54128,192.168.0.11:5555,1,5555,6,...,0,414,6455,187,45,14,0,mitm,1,2
10,10,10,0,172.23.206.130:5353,1,5353,224.0.0.251:5353,0,5353,17,...,0,0,7218,0,24,9,0,mitm,1,2


In [51]:
if multi_class:
    y = df[dataset.class_num_col]
else:
    y = df[dataset.label_col]

if sort_timestamp:
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size)
else:
    X_tr, X_test, y_tr, y_test = train_test_split(
        df, y, test_size=test_size, random_state=13, stratify=y)
    
    X_train, X_val, y_train, y_val = train_test_split(
        X_tr, y_tr, test_size=validation_size, random_state=13, stratify=y_tr)

del df

In [52]:
scaler = StandardScaler()

X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
X_train['h'] = X_train[ cols_to_norm ].values.tolist()

cols_to_drop = list(set(list(X_train.columns)) - set(list([dataset.label_col, dataset.src_ip_col, dataset.dst_ip_col, dataset.class_num_col, 'h'])))
X_train.drop(cols_to_drop, axis=1, inplace=True)

X_val[cols_to_norm] = scaler.transform(X_val[cols_to_norm])
X_val['h'] = X_val[ cols_to_norm ].values.tolist()
X_val.drop(cols_to_drop, axis=1, inplace=True)

X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
X_test['h'] = X_test[ cols_to_norm ].values.tolist()
X_test.drop(cols_to_drop, axis=1, inplace=True)

In [53]:
if graph_type == "window" or graph_type == "line":
    create_weightless_window_graph(
        df=X_train,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=graph_type == "line",
        folder_path=os.path.join(folder_path, "training"),
        edge_attr= ['h', dataset.label_col, dataset.class_num_col],
        file_type="pkl")
    
    create_weightless_window_graph(
        df=X_val,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=graph_type == "line",
        folder_path=os.path.join(folder_path, "validation"),
        edge_attr= ['h', dataset.label_col, dataset.class_num_col],
        file_type="pkl")
    
    create_weightless_window_graph(
        df=X_test,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=graph_type == "line",
        folder_path=os.path.join(folder_path, "testing"),
        edge_attr= ['h', dataset.label_col, dataset.class_num_col],
        file_type="pkl")

In [54]:
if graph_type == "flow":
	os.makedirs(folder_path, exist_ok=True)
	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_val.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")


==>> X_train.shape: (52001, 5)
==>> X_train.shape: (5778, 5)
==>> X_train.shape: (6420, 5)


In [55]:
if graph_type == "flow":
    graph_name = "training_graph"

    G = nx.from_pandas_edgelist(X_train, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col, dataset.class_num_col], create_using=nx.MultiDiGraph())
    # get netowrk properties
    graph_measures = calculate_graph_measures(G, f"{folder_path}/{graph_name}_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open(f"{folder_path}/{graph_name}.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.07022510096430779 seconds
==>> graph_measures: {'number_of_nodes': 52983, 'number_of_edges': 52001, 'max_degree': 9950, 'avg_degree': 1.9629315063322197, 'density': 1.8524513101923482e-05}


In [56]:
if graph_type == "flow":
    graph_name = "validation_graph"

    G = nx.from_pandas_edgelist(X_val, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col, dataset.class_num_col], create_using=nx.MultiDiGraph())
    # get netowrk properties
    graph_measures = calculate_graph_measures(G, f"{folder_path}/{graph_name}_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open(f"{folder_path}/{graph_name}.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.007774699479341507 seconds
==>> graph_measures: {'number_of_nodes': 6112, 'number_of_edges': 5778, 'max_degree': 1084, 'avg_degree': 1.8907068062827226, 'density': 0.00015469700591414846}


In [57]:
if graph_type == "flow":
    graph_name = "testing_graph"
    
    G = nx.from_pandas_edgelist(X_test, dataset.src_ip_col, dataset.dst_ip_col, ['h', dataset.label_col, dataset.class_num_col], create_using=nx.MultiDiGraph())
    
    graph_measures = calculate_graph_measures(G, f"{folder_path}/{graph_name}_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")
    
    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")
    
    with open(f"{folder_path}/{graph_name}.pkl", "wb") as f:
        pickle.dump(G, f)

==>> calculated degrees, in 0.012783598154783249 seconds
==>> graph_measures: {'number_of_nodes': 6766, 'number_of_edges': 6420, 'max_degree': 1191, 'avg_degree': 1.8977239136860775, 'density': 0.0001402604518614987}
