In [81]:
import networkx as nx
import pandas as pd
import random
import torch
import pickle
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
import numpy as np

In [82]:
import torch
from torch_geometric.utils.convert import from_networkx
from pygod.detector import DOMINANT, OCGNN, GUIDE, GAE, GAAN, AnomalyDAE, CONAD
from pygod.metric import eval_f1, eval_precision_at_k, eval_recall_at_k
from pygod.generator import gen_contextual_outlier, gen_structural_outlier
import pickle
import time
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
import torchvision.models as models
import torch_geometric.nn as pyg_nn

In [242]:
# Diagnostic only: print PyTorch's CUDA memory summary in its non-abbreviated
# form (the captured output below is for device ID 0).
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |     990 MB |    8065 MB |    6546 GB |    6545 GB |
|       from large pool |     987 MB |    8042 MB |    6531 GB |    6531 GB |
|       from small pool |       2 MB |      30 MB |      14 GB |      14 GB |
|---------------------------------------------------------------------------|
| Active memory         |     990 MB |    8065 MB |    6546 GB |    6545 GB |
|       from large pool |     987 MB |    8042 MB |    6531 GB |    6531 GB |
|       from small pool |       2 MB |      30 MB |      14 GB |      14 GB |
|---------------------------------------------------------------

In [6]:
import os

# Location of the labelled UNSW parquet file. The original machine-specific
# path is kept as the default; set the UNSW_LABELED_PATH environment variable
# to point the notebook at the dataset on another machine.
unsw_labeled_path = os.environ.get(
    "UNSW_LABELED_PATH",
    "C:\\Users\\asus\\Documents\\nids-pcap-dataset\\unsw_parquet_used_dataset\\unsw_labeled.parquet",
)

In [7]:
# Read the labelled UNSW parquet file into a DataFrame.
unsw = pd.read_parquet(unsw_labeled_path)

In [8]:
def split_train_test(df, test_size=0.3, random_state=None):
    """Split ``df`` into a train part and a test part.

    Thin wrapper around ``train_test_split``.

    Args:
        df: Data to split (a DataFrame in this notebook).
        test_size: Forwarded to ``train_test_split``; 0.3 by default.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded behaviour; pass an integer
            to obtain the same split on every run.

    Returns:
        Tuple ``(train, test)``.
    """
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    return train, test

In [9]:
def graph_modeling_1(df):
    """Build an undirected, weighted graph over ``source_port_info`` values.

    Nodes: one per distinct ``source_port_info``, inserted in order of first
    appearance. Each node gets a one-element feature
    ``[float(len(info_message))]`` and a label ``is_malware``, both taken from
    the first row in which that ``source_port_info`` occurs.

    Edges: within each ``source_ip`` group, every pair of consecutive rows is
    linked; the ``weight`` attribute counts how often that pair was seen.

    Args:
        df: DataFrame with the columns ``source_port_info``, ``info_message``,
            ``is_malware`` and ``source_ip``.

    Returns:
        Tuple ``(graph, node_features, labels)``; the two lists are aligned
        with the node insertion order.
    """
    graph = nx.Graph()

    # First row per source_port_info, in order of first appearance. A single
    # pass replaces the per-node boolean filter over the whole frame (which
    # cost O(rows x nodes)) while selecting exactly the same rows.
    first_rows = df.drop_duplicates(subset="source_port_info", keep="first")
    for node in first_rows["source_port_info"].to_numpy():
        graph.add_node(node)
    node_features = [
        [float(len(message))] for message in first_rows["info_message"].to_numpy()
    ]
    labels = list(first_rows["is_malware"].to_numpy())

    # The group key itself is not needed, only the rows of each source_ip.
    for _, group in df.groupby("source_ip"):
        # Extract the column once instead of materialising a row per lookup.
        ports = group["source_port_info"].tolist()
        for from_node, to_node in zip(ports, ports[1:]):
            if graph.has_edge(from_node, to_node):
                graph[from_node][to_node]["weight"] += 1
            else:
                graph.add_edge(from_node, to_node, weight=1)
    return graph, node_features, labels

In [10]:
# Split the full dataset using the helper's default test_size (0.3). No seed is
# passed here, so the rows in each part can differ between runs.
train_df, test_df = split_train_test(unsw)

In [11]:
# Class balance of the training part: row counts per is_malware value.
train_df.is_malware.value_counts()

is_malware
0    77151
1    10475
Name: count, dtype: int64

In [12]:
# Class balance of the test part: row counts per is_malware value.
test_df.is_malware.value_counts()

is_malware
0    33123
1     4431
Name: count, dtype: int64

In [13]:
# Build the training graph together with its per-node features and labels.
train_graph, train_node_features, label_train = graph_modeling_1(train_df)

In [17]:
# Build the test graph together with its per-node features and labels.
test_graph, test_node_features, label_test = graph_modeling_1(test_df)

In [14]:
# Sanity check: number of nodes in the training graph.
train_graph.number_of_nodes()

71374

In [15]:
# Persist the training graph, node features and labels. `with` closes (and
# therefore flushes) each file as soon as its dump finishes, instead of leaving
# the handle open until garbage collection.
with open('model_graph/train_graph_f.pkl', 'wb') as pkl_file:
    pickle.dump(train_graph, pkl_file)
with open('model_graph/train_node_features_f.pkl', 'wb') as pkl_file:
    pickle.dump(train_node_features, pkl_file)
with open('model_graph/label_train_f.pkl', 'wb') as pkl_file:
    pickle.dump(label_train, pkl_file)

In [18]:
# Persist the test graph, node features and labels. `with` closes (and
# therefore flushes) each file as soon as its dump finishes, instead of leaving
# the handle open until garbage collection.
with open('model_graph/test_graph_f.pkl', 'wb') as pkl_file:
    pickle.dump(test_graph, pkl_file)
with open('model_graph/test_node_features_f.pkl', 'wb') as pkl_file:
    pickle.dump(test_node_features, pkl_file)
with open('model_graph/label_test_f.pkl', 'wb') as pkl_file:
    pickle.dump(label_test, pkl_file)

In [5]:
# Reload the cached training artefacts; `with` closes each file after reading.
# NOTE(review): these paths have no "_f" suffix, whereas the dump cells earlier
# in the notebook write "*_f.pkl" - confirm the intended files are loaded.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('model_graph/train_graph.pkl', 'rb') as pkl_file:
    train_graph = pickle.load(pkl_file)
with open('model_graph/label_train.pkl', 'rb') as pkl_file:
    label_train = pickle.load(pkl_file)
with open('model_graph/train_node_features.pkl', 'rb') as pkl_file:
    train_node_features = pickle.load(pkl_file)

In [6]:
# Reload the cached test artefacts; `with` closes each file after reading.
# NOTE(review): these paths have no "_f" suffix, whereas the dump cells earlier
# in the notebook write "*_f.pkl" - confirm the intended files are loaded.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('model_graph/test_graph.pkl', 'rb') as pkl_file:
    test_graph = pickle.load(pkl_file)
with open('model_graph/label_test.pkl', 'rb') as pkl_file:
    label_test = pickle.load(pkl_file)
with open('model_graph/test_node_features.pkl', 'rb') as pkl_file:
    test_node_features = pickle.load(pkl_file)

## DOMINANT

In [103]:

def make_dominant_model(train_graph, train_node_features,
                        label_train, test_graph, test_node_features):
    """Fit a DOMINANT detector on the training graph and prepare the test graph.

    Args:
        train_graph: networkx graph used for fitting.
        train_node_features: Per-node features of ``train_graph``; a nested
            list or a tensor.
        label_train: Not used by this function; kept so existing callers
            keep working.
        test_graph: networkx graph to be scored later.
        test_node_features: Per-node features of ``test_graph``; a nested
            list or a tensor.

    Returns:
        Tuple ``(fitted, pyG_test)``: the return value of ``DOMINANT.fit`` and
        the test graph converted by ``from_networkx`` with ``x`` attached.
    """
    # graph_modeling_1 returns the features as plain Python lists, so convert
    # them here; tensors that are already float32 pass through without a copy.
    pyG_train = from_networkx(train_graph)
    pyG_train.x = torch.as_tensor(train_node_features, dtype=torch.float32)

    pyG_test = from_networkx(test_graph)
    pyG_test.x = torch.as_tensor(test_node_features, dtype=torch.float32)

    dominant_model = DOMINANT(gpu=0, weight=0.1, num_layers=8, hid_dim=64, backbone=pyg_nn.EdgeCNN,
                              contamination=0.37, lr=0.001, verbose=3, epoch=100)
    dominant_compile = dominant_model.fit(pyG_train)
    return dominant_compile, pyG_test

In [104]:
def predict_dominant(label_test, dominant_compile, pyG_test):
    """Score the test graph with a fitted detector and report metrics.

    Prints the predicted-class counts, precision / recall / F1 computed by
    hand from the binary predictions, and the confusion matrix.

    Args:
        label_test: Ground-truth 0/1 labels of the test nodes; a list, array
            or tensor.
        dominant_compile: Fitted detector exposing ``predict``.
        pyG_test: Test graph handed to ``predict`` as ``data``.

    Returns:
        Tuple ``(precision_pygod, recall_pygod, f1_pygod)`` as produced by
        ``eval_precision_at_k``, ``eval_recall_at_k`` and ``eval_f1``.
        NOTE(review): in the recorded run the first two are identical
        (0.7065) - presumably both use the same default k; confirm against
        the pygod documentation.
    """
    # graph_modeling_1 returns labels as a Python list; .numpy() below needs
    # a tensor, so normalise the input first.
    label_test = torch.as_tensor(label_test)

    dominant_ip_pred_res, dominant_ip_score_res = dominant_compile.predict(
        data=pyG_test, label=label_test, return_pred=True, return_score=True, prob_method='linear')

    unique_values, counts = torch.unique(dominant_ip_pred_res, return_counts=True)
    print(unique_values, counts)

    predictions = dominant_ip_pred_res.cpu().numpy()
    labels = label_test.cpu().numpy()
    TP = np.sum((labels == 1) & (predictions == 1))
    FN = np.sum((labels == 1) & (predictions == 0))
    FP = np.sum((labels == 0) & (predictions == 1))

    # Guard every ratio: with no predicted (or no actual) positives the plain
    # division produced NaN plus a runtime warning.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    confusion_matrix = metrics.confusion_matrix(labels, predictions)

    f1_pygod = eval_f1(label_test, dominant_ip_pred_res)
    precision_pygod = eval_precision_at_k(label_test, dominant_ip_score_res)
    recall_pygod = eval_recall_at_k(label_test, dominant_ip_score_res)
    print("F1 score: ", f1_score)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print(confusion_matrix)
    return precision_pygod, recall_pygod, f1_pygod

In [105]:
# Fit DOMINANT on the training graph and get back the converted test graph.
dominant_model, graph_test = make_dominant_model(train_graph, train_node_features, label_train, test_graph, test_node_features)
# Score the test graph; the three returned values come from the pygod.metric helpers.
precision_score, recall_score, f1_score_for = predict_dominant(label_test, dominant_model, graph_test)

Epoch 0000: Loss 51.0959 |  | Time 3.66
Epoch 0001: Loss 47.2797 |  | Time 14.61
Epoch 0002: Loss 43.1999 |  | Time 14.71
Epoch 0003: Loss 39.2517 |  | Time 14.66
Epoch 0004: Loss 35.7830 |  | Time 14.71
Epoch 0005: Loss 32.6781 |  | Time 14.61
Epoch 0006: Loss 29.7531 |  | Time 14.59
Epoch 0007: Loss 27.3400 |  | Time 14.70
Epoch 0008: Loss 25.1656 |  | Time 14.70
Epoch 0009: Loss 23.2403 |  | Time 14.59
Epoch 0010: Loss 21.4210 |  | Time 14.60
Epoch 0011: Loss 19.8593 |  | Time 14.68
Epoch 0012: Loss 18.4028 |  | Time 14.81
Epoch 0013: Loss 17.0949 |  | Time 14.74
Epoch 0014: Loss 15.8632 |  | Time 14.86
Epoch 0015: Loss 14.7636 |  | Time 14.66
Epoch 0016: Loss 13.8048 |  | Time 14.78
Epoch 0017: Loss 12.9547 |  | Time 14.80
Epoch 0018: Loss 12.2212 |  | Time 14.81
Epoch 0019: Loss 11.5913 |  | Time 14.82
Epoch 0020: Loss 11.0514 |  | Time 14.76
Epoch 0021: Loss 10.5942 |  | Time 14.80
Epoch 0022: Loss 10.2035 |  | Time 14.87
Epoch 0023: Loss 9.8593 |  | Time 14.79
Epoch 0024: Loss 9

In [106]:
# Show the metrics returned by predict_dominant in the previous cell.
print("ini precision: ", precision_score)
print("ini recall: ", recall_score)
print("ini f1_score_unv: ", f1_score_for)

ini precision:  tensor(0.7065)
ini recall:  tensor(0.7065)
ini f1_score_unv:  0.7008086253369271


## OCGNN

In [141]:
def predict_ocgnn(label_test, ocgnn_compile, pyG_test):
    """Score the test graph with a fitted detector and report metrics.

    Prints the predicted-class counts, precision / recall / F1 computed by
    hand from the binary predictions, and the confusion matrix.

    Args:
        label_test: Ground-truth 0/1 labels of the test nodes; a list, array
            or tensor.
        ocgnn_compile: Fitted detector exposing ``predict``.
        pyG_test: Test graph handed to ``predict`` as ``data``.

    Returns:
        Tuple ``(precision_pygod, recall_pygod, f1_pygod)`` as produced by
        ``eval_precision_at_k``, ``eval_recall_at_k`` and ``eval_f1``.
    """
    # graph_modeling_1 returns labels as a Python list; .numpy() below needs
    # a tensor, so normalise the input first.
    label_test = torch.as_tensor(label_test)

    ocgnn_ip_pred_res, ocgnn_ip_score_res = ocgnn_compile.predict(
        data=pyG_test, label=label_test, return_pred=True, return_score=True, prob_method='linear')

    unique_values, counts = torch.unique(ocgnn_ip_pred_res, return_counts=True)
    print(unique_values, counts)

    predictions = ocgnn_ip_pred_res.cpu().numpy()
    labels = label_test.cpu().numpy()
    TP = np.sum((labels == 1) & (predictions == 1))
    FN = np.sum((labels == 1) & (predictions == 0))
    FP = np.sum((labels == 0) & (predictions == 1))

    # Guard every ratio: with no predicted (or no actual) positives the plain
    # division produced NaN plus a runtime warning.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    confusion_matrix = metrics.confusion_matrix(labels, predictions)

    precision_pygod = eval_precision_at_k(label_test, ocgnn_ip_score_res)
    recall_pygod = eval_recall_at_k(label_test, ocgnn_ip_score_res)
    f1_pygod = eval_f1(label_test, ocgnn_ip_pred_res)

    print("F1 score: ", f1_score)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print(confusion_matrix)
    return precision_pygod, recall_pygod, f1_pygod

In [193]:
def make_ocgnn_model(train_graph, train_node_features,
                        label_train, test_graph, test_node_features,
                        label_test):
    """Fit an OCGNN detector on the CPU and hand back the prepared test graph.

    Args:
        train_graph: networkx graph used for fitting.
        train_node_features: Per-node features of ``train_graph``.
        label_train: Training labels; converted to a tensor but not passed
            to ``fit``.
        test_graph: networkx graph to be scored later.
        test_node_features: Per-node features of ``test_graph``.
        label_test: Not used by this function.

    Returns:
        Tuple ``(fitted, test_data)``: the return value of ``OCGNN.fit`` and
        the converted test graph with ``x`` attached.
    """
    # Tensor conversion happens up front, before any graph is converted.
    features_train = torch.tensor(train_node_features)
    labels_train = torch.tensor(label_train)
    features_test = torch.tensor(test_node_features)

    # Training data is explicitly moved to the CPU (the detector uses gpu=-1).
    train_data = from_networkx(train_graph).cpu()
    train_data.x = features_train.cpu()
    labels_train = labels_train.cpu()

    test_data = from_networkx(test_graph)
    test_data.x = features_test

    hyperparams = dict(
        hid_dim=14,
        num_layers=16,
        weight_decay=1,
        contamination=0.37,
        lr=0.001,
        epoch=100,
        gpu=-1,
        verbose=3,
    )
    detector = OCGNN(**hyperparams)
    return detector.fit(train_data), test_data


In [1]:
# Fit OCGNN and get back the converted test graph.
# NOTE(review): the recorded output of this cell is a NameError for
# make_ocgnn_model - the cell was executed before the cell that defines it.
ocgnn_model, graph_test = make_ocgnn_model(train_graph, train_node_features, label_train, test_graph, test_node_features, label_test)
# Timestamp taken right after fitting. NOTE(review): no visible cell reads it.
make_model_runtime = time.time()
# Score the test graph and show the values returned by predict_ocgnn.
precision_score, recall_score, f1_score_for = predict_ocgnn(label_test, ocgnn_model, graph_test)
print("ini f1: ", f1_score_for)
print("ini precision: ", precision_score)
print("ini recall: ", recall_score)


NameError: name 'make_ocgnn_model' is not defined

In [170]:
# NOTE(review): f1_ocgnn, precision_ocgnn and recall_ocgnn are not assigned in
# any visible cell and the recorded output is a NameError. Presumably they
# were meant to hold the values returned by predict_ocgnn - confirm or remove.
print(f1_ocgnn)
print(precision_ocgnn)
print(recall_ocgnn)

NameError: name 'f1_ocgnn' is not defined

In [None]:
# Show the OCGNN timing lists (the recorded output holds three values each).
# NOTE(review): neither list is populated in any visible cell - presumably
# per-run durations collected elsewhere; confirm where they are filled.
print(train_durration_ocgnn)
print(predict_durration_ocgnn)

[37.998138189315796, 34.89245128631592, 37.14145493507385]
[1.1997323036193848, 1.2506287097930908, 0.970118522644043]


## GAE

In [243]:
def make_gae_model(train_graph, train_node_features,
                        label_train, test_graph, test_node_features,
                        label_test):
    """Fit a GAE detector on the CPU and hand back the prepared test graph.

    Args:
        train_graph: networkx graph used for fitting.
        train_node_features: Per-node features of ``train_graph``; a nested
            list or a tensor.
        label_train: Training labels passed to ``fit`` as its second
            argument; a list or a tensor.
        test_graph: networkx graph to be scored later.
        test_node_features: Per-node features of ``test_graph``; a nested
            list or a tensor.
        label_test: Not used by this function; kept so existing callers
            keep working.

    Returns:
        Tuple ``(fitted, pyG_test)``: the return value of ``GAE.fit`` and the
        test graph converted by ``from_networkx`` with ``x`` attached.
    """
    # graph_modeling_1 returns features and labels as plain Python lists, so
    # convert them here; matching tensors pass through without a copy.
    pyG_train = from_networkx(train_graph)
    pyG_train.x = torch.as_tensor(train_node_features, dtype=torch.float32)
    label_train = torch.as_tensor(label_train)

    pyG_test = from_networkx(test_graph)
    pyG_test.x = torch.as_tensor(test_node_features, dtype=torch.float32)

    gae_model = GAE(hid_dim=12, num_layers=12, weight_decay=3,
                contamination=0.37, lr=0.001, epoch=100, gpu=-1,
                verbose=3, recon_s=True, sigmoid_s=True)

    gae_compile = gae_model.fit(pyG_train, label_train)
    return gae_compile, pyG_test


In [244]:
def predict_gae(label_test, gae_compile, pyG_test):
    """Score the test graph with a fitted detector and print its metrics.

    Args:
        label_test: Ground-truth labels of the test nodes.
        gae_compile: Fitted detector exposing ``predict``.
        pyG_test: Test graph handed to ``predict`` as ``data``.

    Returns:
        Tuple ``(f1_score_pygod, precision, recall, f1_score)``: the
        ``eval_f1`` value, the ``eval_precision_at_k`` and
        ``eval_recall_at_k`` values, and the F1 derived from those two.
    """
    pred_labels, outlier_scores = gae_compile.predict(
        data=pyG_test,
        label=label_test,
        return_pred=True,
        return_score=True,
        prob_method='linear',
    )

    pygod_f1 = eval_f1(label_test, pred_labels)
    precision_at_k = eval_precision_at_k(label_test, outlier_scores)
    recall_at_k = eval_recall_at_k(label_test, outlier_scores)
    # Harmonic mean of the two score-based figures above.
    derived_f1 = 2*(precision_at_k*recall_at_k)/(precision_at_k+recall_at_k)

    classes, class_counts = torch.unique(pred_labels, return_counts=True)
    print(classes, class_counts)

    report = (
        ("F1 score: ", derived_f1),
        ("Precision: ", precision_at_k),
        ("Recall: ", recall_at_k),
        ("F1 score pygod: ", pygod_f1),
    )
    for caption, value in report:
        print(caption, value)
    return pygod_f1, precision_at_k, recall_at_k, derived_f1

In [246]:
# Fit GAE (the training labels are forwarded to fit) and get back the converted test graph.
gae_model, graph_test = make_gae_model(train_graph, train_node_features, label_train, test_graph, test_node_features, label_test)
# Score the test graph; predict_gae returns (pygod F1, precision, recall, derived F1).
f1_score_pygod, precision_score, recall_score, f1_score = predict_gae(label_test, gae_model, graph_test)

Epoch 0000: Loss 0.4872 | AUC 0.9597 | Recall 0.8432 | Precision 0.8432 | AP 0.9354 | F1 0.8432 | Time 3.81
Epoch 0001: Loss 0.4623 | AUC 0.9597 | Recall 0.8432 | Precision 0.8432 | AP 0.9355 | F1 0.8432 | Time 3.86
Epoch 0002: Loss 0.4387 | AUC 0.9598 | Recall 0.8433 | Precision 0.8433 | AP 0.9355 | F1 0.8433 | Time 3.92
Epoch 0003: Loss 0.4167 | AUC 0.9598 | Recall 0.8433 | Precision 0.8433 | AP 0.9356 | F1 0.8433 | Time 3.93
Epoch 0004: Loss 0.3962 | AUC 0.9598 | Recall 0.8432 | Precision 0.8432 | AP 0.9356 | F1 0.8432 | Time 3.90
Epoch 0005: Loss 0.3773 | AUC 0.9599 | Recall 0.8432 | Precision 0.8432 | AP 0.9357 | F1 0.8432 | Time 4.00
Epoch 0006: Loss 0.3601 | AUC 0.9599 | Recall 0.8433 | Precision 0.8433 | AP 0.9357 | F1 0.8433 | Time 4.09
Epoch 0007: Loss 0.3447 | AUC 0.9600 | Recall 0.8433 | Precision 0.8433 | AP 0.9358 | F1 0.8433 | Time 4.00
Epoch 0008: Loss 0.3313 | AUC 0.9600 | Recall 0.8437 | Precision 0.8437 | AP 0.9359 | F1 0.8437 | Time 3.92
Epoch 0009: Loss 0.3195 | AU