In [2]:
# Dependencies
!pip install torch_geometric
!pip install rdkit

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Collecting rdkit
  Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.3.6-cp310-cp310-manylinux_2_28_x86_64.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.6


In [3]:
# Imports
import torch
from torch_geometric.data import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn.functional as F
import pandas as pd
import random as rd
import os
from rdkit import Chem
from rdkit.Chem import rdmolops
from google.colab import drive
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from sklearn.metrics import accuracy_score
import itertools

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


## Load the data
This code cell connects to my previously connected private google drive and loads 1000 benign function call graphs and 1000 malicious ones from the artemis family, labels them accordingly and adds them to a list

In [5]:
def read_edgelist_to_graph(filepath, label):
    try:
        edge_index = []
        with open(filepath, 'r') as f:
            for line in f:
                if line.startswith('#'):
                    continue
                source, target = map(int, line.strip().split())
                edge_index.append([source, target])

        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        x = torch.ones(edge_index.max().item() + 1, 1, dtype=torch.float)
        data = Data(x=x, edge_index=edge_index, y=label)
        return data
    except Exception as e:
        print(f"Error reading file {filepath}: {e}")
        return None

benign_dir = '/content/drive/My Drive/malnet/benign/'
artemis_dir = '/content/drive/My Drive/malnet/artemis/'

data_list = []

# Load graphs from the 'benign' directory
for filename in os.listdir(benign_dir):
    if filename.endswith(".edgelist"):
        filepath = os.path.join(benign_dir, filename)
        # label 0 is non malicious
        graph_data = read_edgelist_to_graph(filepath, 0)
        if graph_data:
            data_list.append(graph_data)
# Load graphs from the 'artemis' directory
for filename in os.listdir(artemis_dir):
    if filename.endswith(".edgelist"):
        filepath = os.path.join(artemis_dir, filename)
        # label 1 is malicious
        graph_data = read_edgelist_to_graph(filepath, 1)
        if graph_data:
            data_list.append(graph_data)

print(f"Loaded {len(data_list)} graphs.")


Loaded 2000 graphs.


## Define a data loader

In [6]:
# Shuffle the data list
rd.shuffle(data_list)

# Define the split ratios
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# Calculate the split indices
train_size = int(len(data_list) * train_ratio)
val_size = int(len(data_list) * val_ratio)
test_size = len(data_list) - train_size - val_size

# Split the data
train_data = data_list[:train_size]
val_data = data_list[train_size:train_size + val_size]
test_data = data_list[train_size + val_size:]

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

print(f"Train size: {len(train_data)}")
print(f"Validation size: {len(val_data)}")
print(f"Test size: {len(test_data)}")

Train size: 1600
Validation size: 200
Test size: 200


## Evaluate different hyperparameter combination using gridsearch

In [7]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_units, num_layers, dropout):
        super(GNN, self).__init__()

        self.num_layers = num_layers
        self.hidden_units = hidden_units

        self.layers = torch.nn.ModuleList()
        self.layers.append(GCNConv(1, hidden_units))

        for _ in range(num_layers - 1):
            self.layers.append(GCNConv(hidden_units, hidden_units))

        self.fc = torch.nn.Linear(hidden_units, 2)
        self.dropout = dropout

    def forward(self, data):
        edge_index = data.edge_index
        x = torch.zeros((data.num_nodes, 1)).to(edge_index.device)

        for i in range(self.num_layers):
            x = F.relu(self.layers[i](x, edge_index))
            x = F.dropout(x, p=self.dropout, training=self.training)

        # pool all nodes to perform graph level classification
        x = global_mean_pool(x, data.batch)
        x = self.fc(x)
        return x


In [18]:
def train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, device, num_epochs=10):
    model.train()
    best_val_accuracy = 0

    for epoch in range(num_epochs):
        total_loss = 0
        correct = 0
        total = 0

        for data in train_loader:
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data)

            target = data.y

            loss = criterion(output, target)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            _, predicted = output.max(dim=1)
            correct += predicted.eq(target).sum().item()
            total += data.num_graphs
            """
                CHECK predicted / actual. As of now, the model always predicts 1 for all graphs. (no learning)
            for i in range(data.num_graphs):
                print(f"Predicted: {predicted[i].item()}, Actual: {target[i].item()}")
            """

        train_loss = total_loss / len(train_loader)
        train_accuracy = correct / total
        print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.4f}")

        val_accuracy = evaluate(model, val_loader, device)
        print(f"Validation Accuracy after Epoch {epoch + 1}: {val_accuracy:.4f}")

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy

    return best_val_accuracy


def evaluate(model, val_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data in val_loader:
            data = data.to(device)
            output = model(data)

            target = data.y

            _, predicted = output.max(dim=1)
            correct += predicted.eq(target).sum().item()
            total += data.num_graphs

    val_accuracy = correct / total
    return val_accuracy


In [19]:
param_grid = {
    "lr": [0.0001, 0.001, 0.01],
    "num_layers": [2, 3, 4],
    "hidden_units": [32, 64],
    "dropout": [0.3, 0.5],
    "weight_decay": [1e-5, 1e-4]
}

best_model = None
best_score = -float('inf')
best_params = {}

# Perform grid search
for lr, num_layers, hidden_units, dropout, weight_decay in itertools.product(
    param_grid['lr'],
    param_grid['num_layers'],
    param_grid['hidden_units'],
    param_grid['dropout'],
    param_grid['weight_decay']
):
    model = GNN(hidden_units=hidden_units, num_layers=num_layers, dropout=dropout)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss()

    accuracy = train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, device)

    print(f"Hyperparameters: lr={lr}, num_layers={num_layers}, hidden_units={hidden_units}, dropout={dropout}, weight_decay={weight_decay}")
    print(f"Validation Accuracy: {accuracy}")

    # If the current model has a better accuracy, store it
    if accuracy > best_score:
        best_score = accuracy
        best_model = model
        best_params = {
            "lr": lr,
            "num_layers": num_layers,
            "hidden_units": hidden_units,
            "dropout": dropout,
            "weight_decay": weight_decay
        }

# Output the best hyperparameters
print("Best Hyperparameters found:")
print(best_params)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0
Predicted: 1, Actual: 1
Predicted: 1, Actual: 1
Predicted: 1, Actual: 0
Predicted: 1, Actual: 0

KeyboardInterrupt: 

## Train the Model

## Validation