In [2]:
import networkx as nx
import pandas as pd
import random
import torch
import pickle
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
### Load the labelled UNSW flow table from its parquet export.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where exactly this file exists.
parquet_path = "C:\\Users\\asus\\Documents\\nids-pcap-dataset\\unsw_parquet_used_dataset\\labeled_wo_start_time_w_dup[port].parquet"
unsw_w_st = pd.read_parquet(parquet_path)

In [4]:
# Summarise the loaded frame: column names, non-null counts, dtypes and memory use.
unsw_w_st.info()

<class 'pandas.core.frame.DataFrame'>
Index: 125325 entries, 1 to 490022
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   source_ip         125325 non-null  object  
 1   destination_ip    125325 non-null  object  
 2   source_port       125325 non-null  object  
 3   destination_port  125325 non-null  object  
 4   info_message      125325 non-null  object  
 5   attack_category   15801 non-null   category
 6   is_malware        125325 non-null  int64   
 7   source_ip_info    125325 non-null  object  
 8   source_port_info  125325 non-null  object  
 9   dest_ip_info      125325 non-null  object  
 10  dest_port_info    125325 non-null  object  
 11  count_benign      125325 non-null  int64   
 12  count_malware     125325 non-null  int64   
dtypes: category(1), int64(3), object(9)
memory usage: 12.5+ MB


In [5]:
# Both port columns arrive as object dtype; cast each of them to int32.
for port_column in ("source_port", "destination_port"):
    unsw_w_st[port_column] = unsw_w_st[port_column].astype("int32")

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Map every distinct source IP to an integer code. Despite the "_ohe" suffix
# this is a label encoding (one integer per IP), not a one-hot encoding.
label_encoder = LabelEncoder()
source_ip_codes = label_encoder.fit_transform(unsw_w_st["source_ip"].to_numpy())
unsw_w_st["sip_ohe"] = source_ip_codes

In [7]:
# "lensip" stores the character length of each row's info_message string.
unsw_w_st["lensip"] = unsw_w_st["info_message"].str.len()

In [8]:
# Count the malicious rows per encoded source IP, smallest count first.
malicious_rows = unsw_w_st["is_malware"] == 1
unsw_w_st.loc[malicious_rows, "sip_ohe"].value_counts().sort_values()

sip_ohe
2    3652
0    3710
1    3764
3    3925
Name: count, dtype: int64

### Train / Test Split

In [40]:
# nD_unsw_w_st = unsw_w_st.drop_duplicates(subset=['source_port_info'])

In [49]:
# Random 70/30 row split with a fixed seed; no stratification is requested.
split_frames = train_test_split(unsw_w_st, test_size=0.3, random_state=0)
train_df, test_df = split_frames

In [50]:
# Class balance of the raw training split.
train_df["is_malware"].value_counts()

is_malware
0    77148
1    10579
Name: count, dtype: int64

In [51]:
# Keep only the first row for every distinct source_port_info value, then
# re-check the class balance of what is left.
nD_train_df = train_df.drop_duplicates(subset=["source_port_info"], keep="first")
nD_train_df["is_malware"].value_counts()

is_malware
0    61715
1     9774
Name: count, dtype: int64

In [52]:
# Down-sample the de-duplicated training frame. Benign rows are reduced to the
# number of malicious rows present before sampling, and malicious rows are
# reduced to a fixed cap. The benign target is derived from the data instead of
# being copied by hand from the previous cell's output, so the cell stays
# correct if the upstream split or data changes.
MAX_MALICIOUS_TRAIN = 1000

value_counts = nD_train_df["is_malware"].value_counts()
# .get() avoids a KeyError when one of the two classes is absent.
n_benign_available = int(value_counts.get(0, 0))
n_malicious_available = int(value_counts.get(1, 0))

# Only sample when both classes exceed their targets. After one run the
# malicious count equals the cap, so re-running this cell is a no-op.
if n_benign_available > n_malicious_available and n_malicious_available > MAX_MALICIOUS_TRAIN:
    df_to_lower_benign = nD_train_df[nD_train_df['is_malware'] == 0].sample(
        n=n_malicious_available, random_state=0
    )
    df_to_lower_malicious = nD_train_df[nD_train_df['is_malware'] == 1].sample(
        n=MAX_MALICIOUS_TRAIN, random_state=0
    )
    # Row order after this point is: all sampled benign rows, then malicious.
    nD_train_df = pd.concat([df_to_lower_benign, df_to_lower_malicious])

In [53]:
# Ground-truth labels of the sampled training rows, as numpy and as a float tensor.
label_train = nD_train_df["is_malware"].to_numpy()
label_train_tensor = torch.tensor(label_train, dtype=torch.float32)

# value_counts is rebound to the (classes, counts) tuple shown as the cell output.
value_counts = np.unique(label_train, return_counts=True)
value_counts

(array([0, 1], dtype=int64), array([9774, 1000], dtype=int64))

In [54]:
# Number of rows in the sampled training frame.
len(nD_train_df["source_port_info"])

10774

In [55]:
# Class balance of the raw test split.
test_df["is_malware"].value_counts()

is_malware
0    33126
1     4472
Name: count, dtype: int64

In [56]:
# Keep only the first row for every distinct source_port_info value, then
# re-check the class balance of what is left.
nD_test_df = test_df.drop_duplicates(subset=["source_port_info"], keep="first")
nD_test_df["is_malware"].value_counts()

is_malware
0    29902
1     4260
Name: count, dtype: int64

In [57]:
# Balance the de-duplicated test frame by sampling as many benign rows as there
# are malicious rows. The target size is read from the data instead of being
# copied by hand from the previous cell's output.
value_counts = nD_test_df["is_malware"].value_counts()
# .get() avoids a KeyError when one of the two classes is absent.
n_benign_available = int(value_counts.get(0, 0))
n_malicious_available = int(value_counts.get(1, 0))

# Only sample when there are malicious rows to balance against and benign rows
# outnumber them. Once balanced, re-running this cell is a no-op.
if 0 < n_malicious_available < n_benign_available:
    df_to_lower_benign = nD_test_df[nD_test_df['is_malware'] == 0].sample(
        n=n_malicious_available, random_state=0
    )
    # Row order after this point is: sampled benign rows, then all malicious.
    nD_test_df = pd.concat([df_to_lower_benign, nD_test_df[nD_test_df['is_malware'] == 1]])

In [58]:
# Ground-truth labels of the balanced test rows, as numpy and as a float tensor.
label_test = nD_test_df["is_malware"].to_numpy()
label_test_tensor = torch.tensor(label_test, dtype=torch.float32)

# value_counts is rebound to the (classes, counts) tuple shown as the cell output.
value_counts = np.unique(label_test, return_counts=True)
value_counts

(array([0, 1], dtype=int64), array([4260, 4260], dtype=int64))

#### training graph

In [59]:
# Build the training graph: one node per distinct source_port_info, whose single
# feature is the character length of the first info_message seen for it.
nD_graph_train = nx.Graph()
node_features = []
# attr / labels are not used by the cells shown here; they are still
# initialised so that any other cell expecting them keeps working.
attr = []
labels = []

# A single pass over the first row of each source_port_info replaces the
# per-node boolean-mask scans of the whole frame (quadratic in the row count)
# and drops the source-IP lookup whose result was never used. drop_duplicates
# keeps first occurrences in frame order, which is the order .unique() yields,
# so node order and feature order are unchanged.
first_rows = nD_train_df.drop_duplicates(subset="source_port_info")
for source_port_info, info_message in zip(
    first_rows["source_port_info"], first_rows["info_message"]
):
    nD_graph_train.add_node(source_port_info)
    node_features.append([float(len(info_message))])

# Link consecutive rows that share a source IP. A repeated pair increments the
# existing edge's "weight" rather than adding another edge. Grouping by the
# column name (not a one-element list) keeps the group key a plain scalar.
for _, group in nD_train_df.groupby("source_ip"):
    ports = group["source_port_info"].tolist()
    for from_node, to_node in zip(ports, ports[1:]):
        if nD_graph_train.has_edge(from_node, to_node):
            nD_graph_train[from_node][to_node]["weight"] += 1
        else:
            nD_graph_train.add_edge(from_node, to_node, weight=1)

In [60]:
# Node-feature matrix for the training graph: one row per node, one float column.
# Use the GPU when one is available and otherwise stay on the CPU, instead of
# failing outright on machines without CUDA.
feature_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
node_features_tensor = torch.tensor(node_features, dtype=torch.float32)
node_features_tensor = node_features_tensor.to(feature_device)

In [61]:
# Display the training node-feature tensor (values and device).
node_features_tensor

tensor([[15.],
        [28.],
        [28.],
        ...,
        [51.],
        [27.],
        [52.]], device='cuda:0')

In [62]:
# Collect the training-graph nodes that have no edges at all.
isolated_nodes = [node for node in nx.isolates(nD_graph_train)]

print("Isolated nodes:", len(isolated_nodes), "\n")

Isolated nodes: 0 



In [63]:
## Report how many nodes the training graph contains.
train_node_total = nD_graph_train.number_of_nodes()
print("Number of nodes:", train_node_total, "\n")

Number of nodes: 10774 



#### testing graph

In [64]:
# Build the test graph: one node per distinct source_port_info, whose single
# feature is the character length of the first info_message seen for it.
nD_graph_test = nx.Graph()
node_features_test = []
# attr / labels are not used by the cells shown here; they are still
# initialised so that any other cell expecting them keeps working.
attr = []
labels = []

# A single pass over the first row of each source_port_info replaces the
# per-node boolean-mask scan of the whole frame (quadratic in the row count).
# drop_duplicates keeps first occurrences in frame order, which is the order
# .unique() yields, so node order and feature order are unchanged.
first_rows_test = nD_test_df.drop_duplicates(subset="source_port_info")
for source_port_info, info_message in zip(
    first_rows_test["source_port_info"], first_rows_test["info_message"]
):
    nD_graph_test.add_node(source_port_info)
    node_features_test.append([float(len(info_message))])

# Link consecutive rows that share a source IP. A repeated pair increments the
# existing edge's "weight" rather than adding another edge. Grouping by the
# column name (not a one-element list) keeps the group key a plain scalar.
for _, group in nD_test_df.groupby("source_ip"):
    ports = group["source_port_info"].tolist()
    for from_node, to_node in zip(ports, ports[1:]):
        if nD_graph_test.has_edge(from_node, to_node):
            nD_graph_test[from_node][to_node]["weight"] += 1
        else:
            nD_graph_test.add_edge(from_node, to_node, weight=1)

In [65]:
# Node-feature matrix for the test graph: one row per node, one float column.
# Use the GPU when one is available and otherwise stay on the CPU, instead of
# failing outright on machines without CUDA.
feature_device_test = torch.device("cuda" if torch.cuda.is_available() else "cpu")
node_features_tensor_test = torch.tensor(node_features_test, dtype=torch.float32)
node_features_tensor_test = node_features_tensor_test.to(feature_device_test)

In [66]:
# Display the test node-feature tensor (values and device).
node_features_tensor_test

tensor([[  28.],
        [  28.],
        [  28.],
        ...,
        [  20.],
        [1047.],
        [  67.]], device='cuda:0')

In [67]:
## Report how many nodes the test graph contains.
test_node_total = nD_graph_test.number_of_nodes()
print("Number of nodes:", test_node_total, "\n")

Number of nodes: 8520 



### PyGOD

In [68]:
import torch
from torch_geometric.utils.convert import from_networkx
from pygod.detector import ANOMALOUS, SCAN, Radar, DOMINANT, OCGNN, GUIDE
from pygod.metric import eval_average_precision, eval_roc_auc, eval_f1, eval_precision_at_k, eval_recall_at_k
from pygod.generator import gen_contextual_outlier, gen_structural_outlier
import pickle

In [69]:
# Convert both networkx graphs to PyTorch Geometric Data objects and attach the
# node-feature matrices built in the earlier cells.
pyG_train = from_networkx(nD_graph_train)
pyG_train.x = node_features_tensor

pyG_test = from_networkx(nD_graph_test)
pyG_test.x = node_features_tensor_test

# Move each graph as a whole onto the device its features live on. Previously
# only the training graph was moved to CUDA; the test graph kept its edge
# tensors on the CPU while carrying CUDA features, i.e. a mixed-device object.
pyG_train = pyG_train.to(node_features_tensor.device)
pyG_test = pyG_test.to(node_features_tensor_test.device)

# Labels are matched to nodes by position, so the counts must agree. Failing
# here gives a clearer message than the "inconsistent numbers of samples"
# error raised later from inside fit().
for _split_name, _graph, _label_tensor in (
    ("train", pyG_train, label_train_tensor),
    ("test", pyG_test, label_test_tensor),
):
    if _graph.x.shape[0] != _label_tensor.shape[0]:
        raise ValueError(
            f"{_split_name}: {_graph.x.shape[0]} nodes but "
            f"{_label_tensor.shape[0]} labels; rebuild the graph and labels "
            "from the same frame"
        )

#### DOMINANT

In [129]:
# DOMINANT detector configuration; one keyword per line so each setting is easy
# to tweak and diff.
dominant_model = DOMINANT(
    gpu=0,
    weight=0.02,
    num_layers=4,
    hid_dim=16,
    contamination=0.01,
    lr=0.001,
    verbose=2,
    epoch=100,
)

In [130]:
# Fit DOMINANT on the *test* graph, passing the test labels alongside it.
# NOTE(review): the recorded run printed "Loss nan" at epoch 1 and then raised
# "ValueError: Input contains NaN." The cause is not established here; confirm
# the inputs and hyper-parameters before relying on this cell.
dominant_compile = dominant_model.fit(pyG_test, label_test_tensor)

Epoch 0000: Loss 2061.8242 | AUC 0.0000 | Time 0.42
Epoch 0001: Loss nan | 

ValueError: Input contains NaN.

#### OCGNN

In [150]:
# OCGNN detector configuration; one keyword per line so each setting is easy to
# tweak and diff. gpu=-1 is passed here, whereas DOMINANT above is given gpu=0.
ocgnn_model = OCGNN(
    hid_dim=64,
    num_layers=64,
    weight_decay=0.5,
    contamination=0.33,
    lr=0.001,
    epoch=100,
    batch_size=10000,
    gpu=-1,
    num_neigh=-1,
    beta=0.75,
    warmup=15,
    verbose=3,
    dropout=0.2,
)

In [151]:
# Fit OCGNN on the training graph, passing the training labels alongside it.
# NOTE(review): the recorded run raised "Found input variables with
# inconsistent numbers of samples: [6339, 5486]". Neither number matches the
# 10774 training nodes reported above, so that run presumably used graph/label
# objects from a different kernel state — confirm with Restart & Run All.
ocgnn_compile = ocgnn_model.fit(pyG_train, label_train_tensor)

Epoch 0000: Loss 0.0000 | 

ValueError: Found input variables with inconsistent numbers of samples: [6339, 5486]

In [1]:
# Run the fitted OCGNN detector on the test graph, requesting predictions,
# scores, probabilities and confidences in one call.
ocgnn_outputs = ocgnn_compile.predict(
    data=pyG_test,
    label=label_test_tensor,
    return_pred=True,
    return_score=True,
    return_prob=True,
    prob_method='linear',
    return_conf=True,
)
ocgnn_ip_pred_res, ocgnn_ip_score_res, ocgnn_ip_prob_res, ocgnn_ip_conf_res = ocgnn_outputs

# Show how many nodes fall under each distinct predicted value.
unique_values, counts = torch.unique(ocgnn_ip_pred_res, return_counts=True)
print(unique_values, counts)

NameError: name 'ocgnn_compile' is not defined