In [None]:
number_of_layers = 3
size_of_layer = 256
attack_name = "cli"
batch_size = 1
epochs = 50

In [None]:
# get data from drive
from google.colab import drive

drive.mount('/content/gdrive')
drive_path = "/content/gdrive/MyDrive/fyp_data/"

In [None]:
%pip install networkx==2.5
%pip install  dgl -f https://data.dgl.ai/wheels/repo.html
%pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html
!pip install ipython-autotime
%load_ext autotime

Looking in links: https://data.dgl.ai/wheels/repo.html
Looking in links: https://data.dgl.ai/wheels-test/repo.html
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 21.7 s (started: 2023-10-12 04:04:41 +00:00)


In [None]:
# import required packages
import matplotlib.pyplot as plt
import networkx as nx
import gzip, pickle
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.data
from dgl.data import DGLDataset
import torch as th
import json
from collections import defaultdict
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from collections import OrderedDict
from dgl.nn.pytorch import GraphConv
import os
from tqdm import tqdm

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from dgl.dataloading import GraphDataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# set constant seed for consistency across multiple settings
torch.manual_seed(0)
dgl.seed(0)

device_name = torch.device('cpu')
print("Using {}.".format(device_name))

Using cpu.
time: 10.4 ms (started: 2023-10-12 04:05:18 +00:00)


In [None]:
node_types = ['argv', 'block', 'address', 'task', 'process_memory', 'file', 'socket', 'pipe', 'iattr', 'link', 'machine', 'path', 'shm']
edge_types = ['wasAssociatedWith', 'used', 'wasGeneratedBy', 'wasInformedBy', 'wasDerivedFrom', 'wasAttributedTo', 'actedOnBehalfOf']

node_features_count = len(node_types)
edge_features_count = len(edge_types)

targets = np.array(node_types)
nodelabelEnc = LabelEncoder()
new_target = nodelabelEnc.fit_transform(targets)
nodeencoder = OneHotEncoder(sparse_output=False)
nodeencoder.fit(new_target.reshape(-1, 1))
nodeencoder = nodeencoder

targets = np.array(edge_types)
edgelabelEnc = LabelEncoder()
new_target = edgelabelEnc.fit_transform(targets)
edgeencoder = OneHotEncoder(sparse_output=False)
edgeencoder.fit(new_target.reshape(-1, 1))
edgeencoder = edgeencoder


def one_hot_node(data):
  new_target = nodelabelEnc.transform(np.array(data))
  return torch.from_numpy(nodeencoder.transform(new_target.reshape(-1, 1)))

def one_hot_edge(data):
  new_target = edgelabelEnc.transform(np.array(data))
  return torch.from_numpy(edgeencoder.transform(new_target.reshape(-1, 1)))


class ProvenanceDataset(DGLDataset):
    def __init__(self):
      super().__init__(name='provenance')

    def read_graph(self, file_name):
      graph_raw = json.load(open(file_name,"r"))

      list_of_nodes = defaultdict(set)

      node_types = {}
      edge_types = []

      out_edges = []
      in_edges = []

      for key, value in graph_raw.items():
        node1_type, edge_type, node2_type = key.split("-")
        node1_index, node2_index = value
        for i in node1_index:
          node_types[i] = node1_type
          edge_types.append(edge_type)
        for i in node2_index:
          node_types[i] = node2_type


        out_edges = out_edges + node1_index
        in_edges = in_edges + node2_index

      node_types = OrderedDict(sorted(node_types.items()))
      number_of_edges = list(node_types.keys())[-1] + 1

      for i in range(number_of_edges):
        if i not in node_types:
          node_types[i] = 'task'

      out_edges = th.tensor(out_edges)
      in_edges = th.tensor(in_edges)

      g = dgl.graph((out_edges, in_edges))

      g.ndata['attr'] = (one_hot_node(list(node_types.values())))
      g.edata['attr'] = (one_hot_edge(edge_types))

      g = dgl.remove_self_loop(g)
      g = dgl.add_self_loop(g)

      return g

    def process(self):
        self.graphs = []
        self.labels = []

        # attack graph ID
        for graph_id in tqdm(range(1, 2001)):
          g = self.read_graph(f"{drive_path}{attack_name}_attack/graph{graph_id}.json")
          self.graphs.append(g)
          self.labels.append(1)

        # benign graph ID
        for graph_id in tqdm(range(1, 2001)):
          g = self.read_graph(f"{drive_path}{attack_name}_benign/graph{graph_id}.json")
          self.graphs.append(g)
          self.labels.append(0)

        # Convert the label list to tensor for saving.
        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)

dataset = ProvenanceDataset()
graph, label = dataset[0]
print(graph, label, graph.device, label.device)

100%|██████████| 2000/2000 [01:23<00:00, 24.03it/s]
100%|██████████| 2000/2000 [03:08<00:00, 10.58it/s]

Graph(num_nodes=1196, num_edges=2775,
      ndata_schemes={'attr': Scheme(shape=(13,), dtype=torch.float64)}
      edata_schemes={'attr': Scheme(shape=(7,), dtype=torch.float64)}) tensor(1) cpu cpu
time: 4min 32s (started: 2023-10-12 04:05:35 +00:00)





In [None]:
train_set, test_set, validation_set = dgl.data.utils.split_dataset(dataset, [0.8, 0.1, 0.1], shuffle=True, random_state=42)
data_loader = GraphDataLoader(train_set, batch_size=batch_size, shuffle=True, pin_memory=True)
validation_loader = GraphDataLoader(validation_set, batch_size=1, shuffle=True, pin_memory=True)
test_loader = GraphDataLoader(test_set, batch_size=1, shuffle=True, pin_memory=True)

time: 5.83 ms (started: 2023-10-12 04:13:56 +00:00)


In [None]:
class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, hidden_layer):
        super(Classifier, self).__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.hidden_layer = hidden_layer
        self.hidden_dim = hidden_dim
        if self.hidden_layer == 2:
          self.conv2 = GraphConv(hidden_dim, hidden_dim)
        if self.hidden_layer == 3:
          self.conv3 = GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g):
        # Use node degree as the initial node feature. For undirected graphs, the in-degree
        # is the same as the out_degree.
        h = g.ndata['attr'].float()
        # Perform graph convolution and activation function.
        h = F.relu(self.conv1(g, h))
        if self.hidden_layer == 2:
          h = F.relu(self.conv2(g, h))
        if self.hidden_layer == 3:
          h = F.relu(self.conv3(g, h))
        g.ndata['h'] = h
        # Calculate graph representation by averaging all the node representations.
        hg = dgl.mean_nodes(g, 'h')

        return self.classify(hg)

    def print_params(self):
        print(f"{self.hidden_layer} hidden layers. {self.hidden_dim} hidden dim.")

time: 1.04 ms (started: 2023-10-12 04:13:59 +00:00)


In [None]:
def compute_metrics(preds: torch.Tensor, labels: torch.Tensor, threshold: float = 0.5):
  is_multiclass = labels.max().item() > 1
  if is_multiclass:
      preds = torch.argmax(preds, dim=-1)
      probs = preds.tolist()  # Predicted class not raw probs
  else:
      probs = preds.tolist()
      preds = (preds > threshold).float()

  return {
      'accuracy': accuracy_score(preds, labels),
      'precision': precision_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'recall': recall_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'F1 micro': f1_score(preds, labels, average='micro'),
      'F1 macro': f1_score(preds, labels, average='macro'),
      'probs': probs,
      'labels': labels.tolist(),
  }

def eval_model(function_model, dataset_loader, name):
  predictions = []
  labels = []

  with torch.no_grad():
      for iter, (bg, label) in enumerate(dataset_loader):
          prediction = function_model(bg)
          probs_Y = torch.softmax(prediction, 1)
          argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
          predictions.append(argmax_Y)
          labels.append(label)

  argmax_Y = torch.cat(predictions, dim=0)
  test_Y = torch.cat(labels, dim=0)

  print(f"{name}: {accuracy_score(argmax_Y, test_Y)}")

  return

time: 3.21 ms (started: 2023-10-12 04:14:03 +00:00)


In [None]:
models = []

for hidden_layer in [number_of_layers]:
  for hidden_dim in [size_of_layer]:
    model = Classifier(13, hidden_dim, 2, hidden_layer)
    models.append(model)
# Create model
# model = Classifier(13, 256, 2)
# use pretrained model
# model.load_state_dict(torch.load(f"{drive_path}model.pth"))

time: 2.86 ms (started: 2023-10-12 04:14:06 +00:00)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(preds: torch.Tensor, labels: torch.Tensor, threshold: float = 0.5):
  is_multiclass = labels.max().item() > 1
  if is_multiclass:
      preds = torch.argmax(preds, dim=-1)
      probs = preds.tolist()  # Predicted class not raw probs
  else:
      probs = preds.tolist()
      preds = (preds > threshold).float()

  return {
      'accuracy': accuracy_score(preds, labels),
      'precision': precision_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'recall': recall_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'F1 micro': f1_score(preds, labels, average='micro'),
      'F1 macro': f1_score(preds, labels, average='macro'),
      'F1': f1_score(preds, labels, average='binary'),
      'CM': confusion_matrix(preds, labels)
  }

#torch.save(model.state_dict(), "model.pth")
def print_metrics(model, loader):
  predictions = []
  labels = []

  with torch.no_grad():
      model.eval()
      for iter, (bg, label) in enumerate(loader):
          prediction = model(bg)
          probs_Y = torch.softmax(prediction, 1)
          argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
          predictions.append(argmax_Y)
          labels.append(label)
      model.train()

  argmax_Y = torch.cat(predictions, dim=0)
  test_Y = torch.cat(labels, dim=0)

  metrics = compute_metrics(argmax_Y, test_Y)
  for metric, value in metrics.items():
    print(f"{metric}: {value}")

time: 3.53 ms (started: 2023-10-12 04:14:08 +00:00)


In [None]:
for model in models:
  model.print_params()
  loss_func = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  # prepares for training
  model.train()
  epoch_losses = []
  best_validation_loss = 100000000000
  epochs_without_gain = 0
  end_training = False

  for epoch in tqdm(range(epochs)):
      epoch_loss = 0
      for i in enumerate(data_loader):
          iter, (bg, label) = i
          prediction = model(bg)
          loss = loss_func(prediction, label)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          epoch_loss += loss.detach().item()

      model.eval()

      epoch_loss /= (iter + 1)

      validation_loss = 0
      for iter, (bg, label) in enumerate(validation_loader):
          prediction = model(bg)
          validation_loss += loss_func(prediction, label).detach().item()

      validation_loss /= (iter + 1)

      if validation_loss < best_validation_loss and epoch >= 30:
        epochs_without_gain = 0
        best_validation_loss = validation_loss
        eval_model(model, data_loader, "train")
        eval_model(model, validation_loader, "validation")
        eval_model(model, test_loader, "test")
        torch.save(model.state_dict(), f"{drive_path}{attack_name}_model{model.hidden_dim}_{model.hidden_layer}.pth")
        print_metrics(model, test_loader)
      #elif epoch >= 30:
      #  epochs_without_gain += 1
      #  if epochs_without_gain > 5:
      #    end_training = True

      model.train()

      print('Epoch {}, loss {:.4f}'.format(epoch, epoch_loss))
      epoch_losses.append(epoch_loss)

      if end_training:
        break

3 hidden layers. 256 hidden dim.


  2%|▏         | 1/50 [04:04<3:19:32, 244.34s/it]

Epoch 0, loss 0.5805


  4%|▍         | 2/50 [08:05<3:13:53, 242.36s/it]

Epoch 1, loss 0.5228


  6%|▌         | 3/50 [12:07<3:09:49, 242.33s/it]

Epoch 2, loss 0.5029


  8%|▊         | 4/50 [16:02<3:03:38, 239.52s/it]

Epoch 3, loss 0.4900


 10%|█         | 5/50 [20:07<3:01:03, 241.41s/it]

Epoch 4, loss 0.4830


 12%|█▏        | 6/50 [24:55<3:08:30, 257.05s/it]

Epoch 5, loss 0.4839


 14%|█▍        | 7/50 [29:27<3:07:49, 262.08s/it]

Epoch 6, loss 0.4777


 16%|█▌        | 8/50 [34:12<3:08:31, 269.32s/it]

Epoch 7, loss 0.4763


 18%|█▊        | 9/50 [38:54<3:06:52, 273.47s/it]

Epoch 8, loss 0.4736


 20%|██        | 10/50 [43:27<3:02:02, 273.07s/it]

Epoch 9, loss 0.4675


 22%|██▏       | 11/50 [48:01<2:57:43, 273.42s/it]

Epoch 10, loss 0.4638


 24%|██▍       | 12/50 [52:43<2:54:56, 276.22s/it]

Epoch 11, loss 0.4639


 26%|██▌       | 13/50 [57:35<2:53:15, 280.96s/it]

Epoch 12, loss 0.4614


 28%|██▊       | 14/50 [1:02:12<2:47:48, 279.67s/it]

Epoch 13, loss 0.4603


 30%|███       | 15/50 [1:07:00<2:44:38, 282.23s/it]

Epoch 14, loss 0.4581


 32%|███▏      | 16/50 [1:11:30<2:37:52, 278.60s/it]

Epoch 15, loss 0.4543


 34%|███▍      | 17/50 [1:15:29<2:26:42, 266.73s/it]

Epoch 16, loss 0.4540


 36%|███▌      | 18/50 [1:19:30<2:18:01, 258.80s/it]

Epoch 17, loss 0.4531


 38%|███▊      | 19/50 [1:23:28<2:10:32, 252.67s/it]

Epoch 18, loss 0.4508


 40%|████      | 20/50 [1:27:24<2:03:46, 247.56s/it]

Epoch 19, loss 0.4463


 42%|████▏     | 21/50 [1:31:23<1:58:27, 245.08s/it]

Epoch 20, loss 0.4482


 44%|████▍     | 22/50 [1:35:30<1:54:34, 245.52s/it]

Epoch 21, loss 0.4452


 46%|████▌     | 23/50 [1:39:39<1:51:01, 246.73s/it]

Epoch 22, loss 0.4431


 48%|████▊     | 24/50 [1:43:50<1:47:23, 247.84s/it]

Epoch 23, loss 0.4414


 50%|█████     | 25/50 [1:47:55<1:42:56, 247.06s/it]

Epoch 24, loss 0.4425


 52%|█████▏    | 26/50 [1:52:02<1:38:50, 247.12s/it]

Epoch 25, loss 0.4382


 54%|█████▍    | 27/50 [1:56:10<1:34:51, 247.46s/it]

Epoch 26, loss 0.4377


 56%|█████▌    | 28/50 [2:00:19<1:30:51, 247.78s/it]

Epoch 27, loss 0.4324


 58%|█████▊    | 29/50 [2:04:31<1:27:08, 248.95s/it]

Epoch 28, loss 0.4339


 60%|██████    | 30/50 [2:08:45<1:23:34, 250.74s/it]

Epoch 29, loss 0.4301
train: 0.8209375
validation: 0.805
test: 0.8225


 62%|██████▏   | 31/50 [2:15:21<1:33:06, 294.03s/it]

accuracy: 0.8225
precision: 0.8497409326424871
recall: 0.7961165048543689
F1 micro: 0.8225
F1 macro: 0.8224988906180662
F1: 0.8220551378446115
CM: [[165  29]
 [ 42 164]]
Epoch 30, loss 0.4303


In [None]:
for model in models:
  torch.save(model.state_dict(), f"{drive_path}{attack_name}_model{model.hidden_dim}_{model.hidden_layer}.pth")

time: 11.8 ms (started: 2023-10-09 06:18:00 +00:00)


time: 21.4 ms (started: 2023-10-09 06:18:00 +00:00)
