In [1]:
import pandas as pd
import os
import pickle

import networkx as nx

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from src.dataset.dataset_info import datasets
from src.graph.graph_measures import calculate_graph_measures
from src.graph.graph_construction.window_graph import create_weightless_window_graph
from src.graph.graph_construction.session_graph import define_sessions, create_weightless_session_graph

In [2]:
flow_graph = True

session_graph = False

window_graph = False
window_size= 2000

multi_graph=False
line_graph = False
sort_timestamp = False

In [3]:
# name = "cic_ton_iot_5_percent"
# name = "cic_ton_iot"
# name = "cic_ids_2017_5_percent"
# name = "cic_ids_2017"
# name = "cic_bot_iot"
# name = "cic_ton_iot_modified"
# name = "nf_ton_iotv2_modified"
# name = "ccd_inid_modified"
# name = "nf_uq_nids_modified"
# name = "edge_iiot"
# name = "nf_cse_cic_ids2018"
# name = "nf_bot_iotv2"
# name = "nf_uq_nids"
# name = "x_iiot"

dataset = datasets[name]

In [4]:
df = pd.read_parquet(dataset.path)

In [5]:
df.head()

Unnamed: 0_level_0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack,Class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,192.168.1.193,49235,192.168.1.33,4444,6,0.0,155392,202,34552,149,...,4805,0,0,0,0,0,0,1,ransomware,7
1,192.168.1.193,49228,192.168.1.152,1880,6,0.0,1600,40,35741,65,...,237,0,0,0,0,0,0,0,Benign,0
2,192.168.1.152,0,192.168.1.193,0,1,0.0,212,2,0,0,...,0,771,3,0,0,0,0,0,Benign,0
3,192.168.1.169,65317,239.255.255.250,1900,17,0.0,165,1,0,0,...,0,0,0,0,0,0,0,0,Benign,0
4,192.168.1.79,60766,192.168.1.255,15600,17,0.0,63,1,0,0,...,0,0,0,0,0,0,0,0,Benign,0


In [6]:
cols_to_norm = list(set(list(df.columns ))  - set(list([dataset.label_col]))  - set(list([dataset.class_num_col])) - set(dataset.drop_columns)  - set(dataset.weak_columns))
cols_to_norm

['NUM_PKTS_512_TO_1024_BYTES',
 'FTP_COMMAND_RET_CODE',
 'L7_PROTO',
 'SERVER_TCP_FLAGS',
 'OUT_PKTS',
 'SRC_TO_DST_SECOND_BYTES',
 'RETRANSMITTED_IN_BYTES',
 'DNS_TTL_ANSWER',
 'NUM_PKTS_256_TO_512_BYTES',
 'NUM_PKTS_128_TO_256_BYTES',
 'TCP_WIN_MAX_OUT',
 'DURATION_OUT',
 'NUM_PKTS_1024_TO_1514_BYTES',
 'FLOW_DURATION_MILLISECONDS',
 'MAX_IP_PKT_LEN',
 'RETRANSMITTED_OUT_PKTS',
 'RETRANSMITTED_IN_PKTS',
 'OUT_BYTES',
 'SHORTEST_FLOW_PKT',
 'DNS_QUERY_TYPE',
 'IN_PKTS',
 'ICMP_IPV4_TYPE',
 'DNS_QUERY_ID',
 'MIN_IP_PKT_LEN',
 'NUM_PKTS_UP_TO_128_BYTES',
 'SRC_TO_DST_AVG_THROUGHPUT',
 'DST_TO_SRC_SECOND_BYTES',
 'DST_TO_SRC_AVG_THROUGHPUT']

In [7]:
df[dataset.label_col].value_counts()

Label
1    10840961
0     6088816
Name: count, dtype: int64

In [8]:
if sort_timestamp:
    df[dataset.timestamp_col] = pd.to_datetime(df[dataset.timestamp_col].str.strip(), format=dataset.timestamp_format)
    df.sort_values(dataset.timestamp_col, inplace=True)

In [9]:
if window_graph and line_graph:
    
    df.reset_index(drop=True, inplace=True)
    df.reset_index(drop=False, inplace=True)
    
    scaler = StandardScaler()

    df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])
    df['h'] = df[ cols_to_norm ].values.tolist()
    
    folder_path="datasets/" + name + "/line_graph_unsorted"
    if sort_timestamp:
        folder_path="datasets/" + name + "/line_graph_sorted"
        
    create_weightless_window_graph(
        df=df,
        src_ip_col=dataset.src_ip_col,
        dst_ip_col=dataset.dst_ip_col,
        window_size=window_size,
        line_graph=True,
        folder_path=folder_path,
        # folder_path=None,
        # test_percentage = 20,
        edge_attr= ['h',dataset.label_col, "index"],
        file_type="pkl")    

In [10]:
if session_graph:
    folder_path="datasets/" + name + "/session_graphs"
        
    df2 = define_sessions(df,
                          src_ip_col=dataset.src_ip_col,
                          src_port_col=dataset.src_port_col,
                          dst_ip_col=dataset.dst_ip_col,
                          dst_port_col=dataset.dst_port_col,
                        #   protocol_col=dataset.
    )
    
    create_weightless_session_graph(df,
                                    src_ip_col = dataset.src_ip_col,
                                    dst_ip_col = dataset.dst_ip_col,
                                    multi_graph=multi_graph,
                                    line_graph=line_graph,
                                    folder_path=folder_path)

In [11]:
if not window_graph:
	y = df[dataset.label_col]
 
	if sort_timestamp:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3)
	else:
		X_train, X_test, y_train, y_test = train_test_split(
			df, y, test_size=0.3, random_state=13, stratify=y)

	del df

	print(f"==>> X_train.shape: {X_train.shape}")
	print(f"==>> X_train.shape: {X_test.shape}")

	scaler = StandardScaler()

	X_train[cols_to_norm] = scaler.fit_transform(X_train[cols_to_norm])
	X_train['h'] = X_train[ cols_to_norm ].values.tolist()

	X_test[cols_to_norm] = scaler.transform(X_test[cols_to_norm])
	X_test['h'] = X_test[ cols_to_norm ].values.tolist()

==>> X_train.shape: (11850843, 46)
==>> X_train.shape: (5078934, 46)


  temp **= 2
  new_unnormalized_variance -= correction**2 / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2


In [12]:
if flow_graph:
    G = nx.from_pandas_edgelist(X_train, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col], create_using=nx.MultiDiGraph())
    # get netowrk properties
    graph_measures = calculate_graph_measures(G, "datasets/" + name + "/training_graph_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")

    # graph_measures = calculate_graph_measures(nx.DiGraph(G), "datasets/" + name + "/training_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")

    with open("datasets/" + name + "/training_graph.pkl", "wb") as f:
        pickle.dump(G, f)

==>> number_of_nodes: 21883, in 2.0701438188552856e-05 seconds
==>> number_of_edges: 11850843, in 0.09248649887740612 seconds
==>> calculated degrees, in 0.07947319932281971 seconds
==>> density: 0.024748869780723444, in 0.07855300046503544 seconds
==>> graph_measures: {'number_of_nodes': 21883, 'number_of_edges': 11850843, 'max_degree': 3182465, 'avg_degree': 1083.1095370835808, 'density': 0.024748869780723444}


In [13]:
if flow_graph:
    G_test = nx.from_pandas_edgelist(X_test, dataset.src_ip_col, dataset.dst_ip_col, ['h',dataset.label_col],create_using=nx.MultiDiGraph())
    
    graph_measures = calculate_graph_measures(G_test, "datasets/" + name + "/testing_graph_measures.json", verbose=True)
    print(f"==>> graph_measures: {graph_measures}")
    
    # graph_measures = calculate_graph_measures(nx.DiGraph(G_test), "datasets/" + name + "/testing_graph_simple_measures.json", verbose=True)
    # print(f"==>> graph_measures: {graph_measures}")
    
    with open("datasets/" + name + "/testing_graph.pkl", "wb") as f:
        pickle.dump(G_test, f)

==>> number_of_nodes: 11457, in 2.5700777769088745e-05 seconds
==>> number_of_edges: 5078934, in 0.050220098346471786 seconds
==>> calculated degrees, in 0.03748579882085323 seconds
==>> density: 0.03869622959884494, in 0.03478199988603592 seconds
==>> graph_measures: {'number_of_nodes': 11457, 'number_of_edges': 5078934, 'max_degree': 1365075, 'avg_degree': 886.6080125687353, 'density': 0.03869622959884494}


In [14]:
# if window_graph and line_graph:

#     create_weightless_window_graph(
#         df=X_test,
#         src_ip_col=dataset.src_ip_col,
#         dst_ip_col=dataset.dst_ip_col,
#         window_size=window_size,
#         line_graph=True,
#         folder_path="datasets/" + name + "/line_graph/testing",
#         file_type="pkl")

In [15]:
# nx.write_gexf(G_test, "datasets/cic_ton_iot/testing_graph.gexf")