In [1]:
number_of_layers = 3
size_of_layer = 256
attack_name = "sqli"
batch_size = 1
epochs = 50

In [2]:
# get data from drive
from google.colab import drive

drive.mount('/content/gdrive')
drive_path = "/content/gdrive/MyDrive/fyp_data/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
%pip install networkx==2.5
%pip install  dgl -f https://data.dgl.ai/wheels/repo.html
%pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html
!pip install ipython-autotime
%load_ext autotime

Looking in links: https://data.dgl.ai/wheels/repo.html
Looking in links: https://data.dgl.ai/wheels-test/repo.html
time: 382 µs (started: 2023-10-12 06:46:19 +00:00)


In [4]:
# import required packages
import matplotlib.pyplot as plt
import networkx as nx
import gzip, pickle
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.data
from dgl.data import DGLDataset
import torch as th
import json
from collections import defaultdict
import numpy as np
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from collections import OrderedDict
from dgl.nn.pytorch import GraphConv
import os
from tqdm import tqdm

import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from dgl.dataloading import GraphDataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# set constant seed for consistency across multiple settings
torch.manual_seed(42)
dgl.seed(42)

device_name = torch.device('cpu')
print("Using {}.".format(device_name))

Using cpu.
time: 3.35 s (started: 2023-10-12 06:46:19 +00:00)


In [5]:
node_types = ['argv', 'block', 'address', 'task', 'process_memory', 'file', 'socket', 'pipe', 'iattr', 'link', 'machine', 'path', 'shm']
edge_types = ['wasAssociatedWith', 'used', 'wasGeneratedBy', 'wasInformedBy', 'wasDerivedFrom', 'wasAttributedTo', 'actedOnBehalfOf']

node_features_count = len(node_types)
edge_features_count = len(edge_types)

targets = np.array(node_types)
nodelabelEnc = LabelEncoder()
new_target = nodelabelEnc.fit_transform(targets)
nodeencoder = OneHotEncoder(sparse_output=False)
nodeencoder.fit(new_target.reshape(-1, 1))
nodeencoder = nodeencoder

targets = np.array(edge_types)
edgelabelEnc = LabelEncoder()
new_target = edgelabelEnc.fit_transform(targets)
edgeencoder = OneHotEncoder(sparse_output=False)
edgeencoder.fit(new_target.reshape(-1, 1))
edgeencoder = edgeencoder


def one_hot_node(data):
  new_target = nodelabelEnc.transform(np.array(data))
  return torch.from_numpy(nodeencoder.transform(new_target.reshape(-1, 1)))

def one_hot_edge(data):
  new_target = edgelabelEnc.transform(np.array(data))
  return torch.from_numpy(edgeencoder.transform(new_target.reshape(-1, 1)))


class ProvenanceDataset(DGLDataset):
    def __init__(self):
      super().__init__(name='provenance')

    def read_graph(self, file_name):
      graph_raw = json.load(open(file_name,"r"))

      list_of_nodes = defaultdict(set)

      node_types = {}
      edge_types = []

      out_edges = []
      in_edges = []

      for key, value in graph_raw.items():
        node1_type, edge_type, node2_type = key.split("-")
        node1_index, node2_index = value
        for i in node1_index:
          node_types[i] = node1_type
          edge_types.append(edge_type)
        for i in node2_index:
          node_types[i] = node2_type


        out_edges = out_edges + node1_index
        in_edges = in_edges + node2_index

      node_types = OrderedDict(sorted(node_types.items()))
      number_of_edges = list(node_types.keys())[-1] + 1

      for i in range(number_of_edges):
        if i not in node_types:
          node_types[i] = 'task'

      out_edges = th.tensor(out_edges)
      in_edges = th.tensor(in_edges)

      g = dgl.graph((out_edges, in_edges))

      g.ndata['attr'] = (one_hot_node(list(node_types.values())))
      g.edata['attr'] = (one_hot_edge(edge_types))

      g = dgl.remove_self_loop(g)
      g = dgl.add_self_loop(g)

      return g

    def process(self):
        self.graphs = []
        self.labels = []

        # attack graph ID
        for graph_id in tqdm(range(1, 2001)):
          g = self.read_graph(f"{drive_path}{attack_name}_attack/graph{graph_id}.json")
          self.graphs.append(g)
          self.labels.append(1)

        # benign graph ID
        for graph_id in tqdm(range(1, 2001)):
          g = self.read_graph(f"{drive_path}{attack_name}_benign/graph{graph_id}.json")
          self.graphs.append(g)
          self.labels.append(0)

        # Convert the label list to tensor for saving.
        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)

dataset = ProvenanceDataset()
graph, label = dataset[0]
print(graph, label, graph.device, label.device)

100%|██████████| 2000/2000 [14:45<00:00,  2.26it/s]
100%|██████████| 2000/2000 [11:26<00:00,  2.91it/s]

Graph(num_nodes=1191, num_edges=2713,
      ndata_schemes={'attr': Scheme(shape=(13,), dtype=torch.float64)}
      edata_schemes={'attr': Scheme(shape=(7,), dtype=torch.float64)}) tensor(1) cpu cpu
time: 26min 12s (started: 2023-10-12 06:46:44 +00:00)





In [6]:
train_set, test_set, validation_set = dgl.data.utils.split_dataset(dataset, [0.8, 0.1, 0.1], shuffle=True, random_state=42)
data_loader = GraphDataLoader(train_set, batch_size=batch_size, shuffle=True, pin_memory=True)
validation_loader = GraphDataLoader(validation_set, batch_size=1, shuffle=True, pin_memory=True)
test_loader = GraphDataLoader(test_set, batch_size=1, shuffle=True, pin_memory=True)

time: 18.9 ms (started: 2023-10-12 07:14:57 +00:00)


In [7]:
class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, hidden_layer):
        super(Classifier, self).__init__()
        self.conv1 = GraphConv(in_dim, hidden_dim)
        self.hidden_layer = hidden_layer
        self.hidden_dim = hidden_dim
        if self.hidden_layer == 2:
          self.conv2 = GraphConv(hidden_dim, hidden_dim)
        if self.hidden_layer == 3:
          self.conv3 = GraphConv(hidden_dim, hidden_dim)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g):
        # Use node degree as the initial node feature. For undirected graphs, the in-degree
        # is the same as the out_degree.
        h = g.ndata['attr'].float()
        # Perform graph convolution and activation function.
        h = F.relu(self.conv1(g, h))
        if self.hidden_layer == 2:
          h = F.relu(self.conv2(g, h))
        if self.hidden_layer == 3:
          h = F.relu(self.conv3(g, h))
        g.ndata['h'] = h
        # Calculate graph representation by averaging all the node representations.
        hg = dgl.mean_nodes(g, 'h')

        return self.classify(hg)

    def print_params(self):
        print(f"{self.hidden_layer} hidden layers. {self.hidden_dim} hidden dim.")

time: 1.66 ms (started: 2023-10-12 07:15:06 +00:00)


In [8]:
def compute_metrics(preds: torch.Tensor, labels: torch.Tensor, threshold: float = 0.5):
  is_multiclass = labels.max().item() > 1
  if is_multiclass:
      preds = torch.argmax(preds, dim=-1)
      probs = preds.tolist()  # Predicted class not raw probs
  else:
      probs = preds.tolist()
      preds = (preds > threshold).float()

  return {
      'accuracy': accuracy_score(preds, labels),
      'precision': precision_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'recall': recall_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'F1 micro': f1_score(preds, labels, average='micro'),
      'F1 macro': f1_score(preds, labels, average='macro'),
      'F1': f1_score(preds, labels, average='binary'),
      'probs': probs,
      'labels': labels.tolist(),
  }

def eval_model(function_model, dataset_loader, name):
  predictions = []
  labels = []

  with torch.no_grad():
      for iter, (bg, label) in enumerate(dataset_loader):
          prediction = function_model(bg)
          probs_Y = torch.softmax(prediction, 1)
          argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
          predictions.append(argmax_Y)
          labels.append(label)

  argmax_Y = torch.cat(predictions, dim=0)
  test_Y = torch.cat(labels, dim=0)

  print(f"{name}: {accuracy_score(argmax_Y, test_Y)}")

  return

time: 2.18 ms (started: 2023-10-12 07:15:18 +00:00)


In [9]:
models = []

for hidden_layer in [number_of_layers]:
  for hidden_dim in [size_of_layer]:
    model = Classifier(13, hidden_dim, 2, hidden_layer)
    models.append(model)
# Create model
# model = Classifier(13, 256, 2)
# use pretrained model
# model.load_state_dict(torch.load(f"{drive_path}model.pth"))

time: 39.2 ms (started: 2023-10-12 07:15:22 +00:00)


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(preds: torch.Tensor, labels: torch.Tensor, threshold: float = 0.5):
  is_multiclass = labels.max().item() > 1
  if is_multiclass:
      preds = torch.argmax(preds, dim=-1)
      probs = preds.tolist()  # Predicted class not raw probs
  else:
      probs = preds.tolist()
      preds = (preds > threshold).float()

  return {
      'accuracy': accuracy_score(preds, labels),
      'precision': precision_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'recall': recall_score(preds, labels, average='micro' if is_multiclass else 'binary'),
      'F1 micro': f1_score(preds, labels, average='micro'),
      'F1 macro': f1_score(preds, labels, average='macro'),
      'F1': f1_score(preds, labels, average='binary'),
      'CM': confusion_matrix(preds, labels)
  }

#torch.save(model.state_dict(), "model.pth")
def print_metrics(model, loader):
  predictions = []
  labels = []

  with torch.no_grad():
      model.eval()
      for iter, (bg, label) in enumerate(loader):
          prediction = model(bg)
          probs_Y = torch.softmax(prediction, 1)
          argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
          predictions.append(argmax_Y)
          labels.append(label)
      model.train()

  argmax_Y = torch.cat(predictions, dim=0)
  test_Y = torch.cat(labels, dim=0)

  metrics = compute_metrics(argmax_Y, test_Y)
  for metric, value in metrics.items():
    print(f"{metric}: {value}")

time: 6.98 ms (started: 2023-10-12 07:15:39 +00:00)


In [11]:
for model in models:
  model.print_params()
  loss_func = nn.CrossEntropyLoss()
  optimizer = optim.Adam(model.parameters(), lr=0.001)

  # prepares for training
  model.train()
  epoch_losses = []
  best_validation_loss = 100000000000
  epochs_without_gain = 0
  end_training = False

  for epoch in tqdm(range(epochs)):
      epoch_loss = 0
      for i in enumerate(data_loader):
          iter, (bg, label) = i
          prediction = model(bg)
          loss = loss_func(prediction, label)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          epoch_loss += loss.detach().item()

      model.eval()

      epoch_loss /= (iter + 1)

      validation_loss = 0
      for iter, (bg, label) in enumerate(validation_loader):
          prediction = model(bg)
          validation_loss += loss_func(prediction, label).detach().item()

      validation_loss /= (iter + 1)

      if validation_loss < best_validation_loss and epoch >= 30:
        epochs_without_gain = 0
        best_validation_loss = validation_loss
        eval_model(model, data_loader, "train")
        eval_model(model, validation_loader, "validation")
        eval_model(model, test_loader, "test")
        torch.save(model.state_dict(), f"{drive_path}{attack_name}_model{model.hidden_dim}_{model.hidden_layer}.pth")
        print_metrics(model, test_loader)
      #elif epoch >= 30:
      #  epochs_without_gain += 1
      #  if epochs_without_gain > 5:
      #    end_training = True

      model.train()

      print('Epoch {}, loss {:.4f}'.format(epoch, epoch_loss))
      epoch_losses.append(epoch_loss)

      if end_training:
        break

3 hidden layers. 256 hidden dim.


  2%|▏         | 1/50 [02:25<1:59:07, 145.88s/it]

Epoch 0, loss 0.6939


  4%|▍         | 2/50 [04:51<1:56:21, 145.46s/it]

Epoch 1, loss 0.6938


  6%|▌         | 3/50 [07:18<1:54:43, 146.46s/it]

Epoch 2, loss 0.6936


  8%|▊         | 4/50 [09:40<1:50:44, 144.44s/it]

Epoch 3, loss 0.6937


 10%|█         | 5/50 [12:04<1:48:13, 144.31s/it]

Epoch 4, loss 0.6932


 12%|█▏        | 6/50 [14:25<1:45:07, 143.34s/it]

Epoch 5, loss 0.6938


 14%|█▍        | 7/50 [16:51<1:43:16, 144.10s/it]

Epoch 6, loss 0.6930


 16%|█▌        | 8/50 [19:16<1:41:10, 144.53s/it]

Epoch 7, loss 0.6932


 18%|█▊        | 9/50 [21:39<1:38:26, 144.06s/it]

Epoch 8, loss 0.6934


 20%|██        | 10/50 [24:04<1:36:11, 144.29s/it]

Epoch 9, loss 0.6927


 22%|██▏       | 11/50 [26:28<1:33:43, 144.20s/it]

Epoch 10, loss 0.6934


 24%|██▍       | 12/50 [28:54<1:31:42, 144.81s/it]

Epoch 11, loss 0.6933


 26%|██▌       | 13/50 [31:19<1:29:18, 144.83s/it]

Epoch 12, loss 0.6932


 28%|██▊       | 14/50 [33:48<1:27:37, 146.05s/it]

Epoch 13, loss 0.6933


 30%|███       | 15/50 [36:14<1:25:11, 146.04s/it]

Epoch 14, loss 0.6933


 32%|███▏      | 16/50 [38:41<1:22:54, 146.30s/it]

Epoch 15, loss 0.6925


 34%|███▍      | 17/50 [41:09<1:20:43, 146.79s/it]

Epoch 16, loss 0.6934


 36%|███▌      | 18/50 [43:35<1:18:14, 146.71s/it]

Epoch 17, loss 0.6928


 38%|███▊      | 19/50 [46:07<1:16:35, 148.25s/it]

Epoch 18, loss 0.6930


 40%|████      | 20/50 [48:40<1:14:51, 149.70s/it]

Epoch 19, loss 0.6932


 42%|████▏     | 21/50 [51:14<1:13:00, 151.05s/it]

Epoch 20, loss 0.6930


 44%|████▍     | 22/50 [53:48<1:10:47, 151.69s/it]

Epoch 21, loss 0.6928


 46%|████▌     | 23/50 [56:26<1:09:06, 153.58s/it]

Epoch 22, loss 0.6926


 48%|████▊     | 24/50 [59:00<1:06:42, 153.93s/it]

Epoch 23, loss 0.6933


 50%|█████     | 25/50 [1:01:33<1:03:59, 153.58s/it]

Epoch 24, loss 0.6926


 52%|█████▏    | 26/50 [1:04:09<1:01:39, 154.13s/it]

Epoch 25, loss 0.6924


 54%|█████▍    | 27/50 [1:06:42<58:57, 153.80s/it]  

Epoch 26, loss 0.6924


 56%|█████▌    | 28/50 [1:09:17<56:32, 154.21s/it]

Epoch 27, loss 0.6924


 58%|█████▊    | 29/50 [1:11:53<54:09, 154.73s/it]

Epoch 28, loss 0.6924


 60%|██████    | 30/50 [1:14:29<51:41, 155.08s/it]

Epoch 29, loss 0.6929
train: 0.510625
validation: 0.4775
test: 0.495


 62%|██████▏   | 31/50 [1:18:29<57:12, 180.68s/it]

accuracy: 0.495
precision: 0.5025906735751295
recall: 0.47783251231527096
F1 micro: 0.495
F1 macro: 0.494949494949495
F1: 0.48989898989898994
CM: [[101  96]
 [106  97]]
Epoch 30, loss 0.6929


 64%|██████▍   | 32/50 [1:21:07<52:07, 173.74s/it]

Epoch 31, loss 0.6925


 66%|██████▌   | 33/50 [1:23:43<47:45, 168.58s/it]

Epoch 32, loss 0.6927


 68%|██████▊   | 34/50 [1:26:24<44:20, 166.30s/it]

Epoch 33, loss 0.6923


 70%|███████   | 35/50 [1:29:01<40:51, 163.43s/it]

Epoch 34, loss 0.6928


 72%|███████▏  | 36/50 [1:31:40<37:51, 162.22s/it]

Epoch 35, loss 0.6926


 74%|███████▍  | 37/50 [1:34:20<34:58, 161.46s/it]

Epoch 36, loss 0.6924


 76%|███████▌  | 38/50 [1:37:00<32:12, 161.03s/it]

Epoch 37, loss 0.6929


 78%|███████▊  | 39/50 [1:39:37<29:17, 159.76s/it]

Epoch 38, loss 0.6927


 80%|████████  | 40/50 [1:42:17<26:38, 159.88s/it]

Epoch 39, loss 0.6924


 82%|████████▏ | 41/50 [1:44:58<24:03, 160.39s/it]

Epoch 40, loss 0.6926


 84%|████████▍ | 42/50 [1:47:36<21:15, 159.49s/it]

Epoch 41, loss 0.6924


 86%|████████▌ | 43/50 [1:50:14<18:33, 159.13s/it]

Epoch 42, loss 0.6924


 88%|████████▊ | 44/50 [1:52:55<15:57, 159.58s/it]

Epoch 43, loss 0.6926


 90%|█████████ | 45/50 [1:55:32<13:14, 158.93s/it]

Epoch 44, loss 0.6925


 92%|█████████▏| 46/50 [1:58:12<10:36, 159.10s/it]

Epoch 45, loss 0.6931


 94%|█████████▍| 47/50 [2:00:50<07:56, 158.95s/it]

Epoch 46, loss 0.6925


 96%|█████████▌| 48/50 [2:03:31<05:18, 159.40s/it]

Epoch 47, loss 0.6919


 98%|█████████▊| 49/50 [2:06:08<02:38, 158.71s/it]

Epoch 48, loss 0.6927


100%|██████████| 50/50 [2:08:44<00:00, 154.49s/it]

Epoch 49, loss 0.6923
time: 2h 8min 44s (started: 2023-10-12 07:15:43 +00:00)





In [None]:
for model in models:
  torch.save(model.state_dict(), f"{drive_path}{attack_name}_model{model.hidden_dim}_{model.hidden_layer}.pth")

time: 11.8 ms (started: 2023-10-09 06:18:00 +00:00)


time: 21.4 ms (started: 2023-10-09 06:18:00 +00:00)
