In [12]:
# This notebook preprocesses the dataset
# The full data (1033 graphs) is about 9 GB

In [1]:
import networkx as nx
import pickle
import random
import warnings 
import glob
import os
import torch
warnings.filterwarnings('ignore') 

In [2]:
def _doPreprocess(G):
    """Convert one graph into dense tensors: adjacency, node features, path targets.

    Feature columns are laid out as: one column per label combination, then one
    per boolean property, then one per operating system.

    Args:
        G: Graph object exposing ``nodes()``, ``nodes(data=True)`` and
            ``edges(data=True)``. Every node's data must contain ``labels`` and
            ``properties['name']``; an edge's kind is read from ``type_``.

    Returns:
        A tuple ``(adj_tensor, X_matrices, Y_matrices, Y_nodes, edge_types,
        index_to_node)`` where

        * ``adj_tensor`` is a float32 tensor of shape
          ``(num_nodes, num_nodes, len(edge_types))`` with a 1 wherever an edge
          of that type goes from row node to column node;
        * ``X_matrices[i]`` is a copy of the node-feature matrix in which the
          "owned" column is cleared and then set only on the node whose name
          equals ``Y_nodes[i]`` (if such a node exists);
        * ``Y_matrices[i]`` is an int64 ``(num_nodes, num_nodes)`` matrix with a
          1 for every "optimal" edge whose ``pathId`` equals ``Y_nodes[i]``;
        * ``Y_nodes`` lists the distinct ``pathId`` values in first-seen edge
          order, so the three lists are aligned and stable across runs;
        * ``edge_types`` is the list of edge types indexing the last axis of
          ``adj_tensor``;
        * ``index_to_node`` maps a row/column index back to the node id.

    Raises:
        KeyError: If a node lacks ``labels`` or ``properties['name']``, or an
            "optimal" edge lacks ``properties['pathId']``.
    """
    label_combinations = [['Base', 'Computer'], ['Base', 'OU'], ['Base', 'User'], ['Base', 'Group'],
                          ['Base', 'GPO'], ['Base', 'Domain']]
    properties = ["enabled", "hasspn", "highvalue", "is_vulnerable", "target", "owned"]
    # NOTE(review): "Windows Server 2008" appears twice. The later entry wins in
    # os_to_index, so the column of the first occurrence is never set. The
    # duplicate is kept so the feature width stays unchanged; confirm which
    # operating system was actually intended for one of the two slots.
    operating_systems = ["Windows Server 2003", "Windows Server 2008", "Windows 7", "Windows 10",
                         "Windows XP", "Windows Server 2012", "Windows Server 2008"]
    edge_types = ["AdminTo", "AllowedToDelegate", "CanRDP", "Contains", "DCSync", "ExecuteDCOM",
                  "GenericAll", "GetChanges", "GetChangesAll", "GpLink", "HasSession", "MemberOf",
                  "Open", "Owns", "WriteDacl", "WriteOwner"]

    # Column offsets of the three feature groups.
    property_offset = len(label_combinations)
    os_offset = property_offset + len(properties)
    num_features = os_offset + len(operating_systems)
    # "owned" is a boolean property, NOT the last column (the last columns are
    # operating systems), so its position is computed explicitly.
    owned_column = property_offset + properties.index("owned")

    os_to_index = {os_name: i for i, os_name in enumerate(operating_systems)}
    edge_type_to_index = {edge_type: i for i, edge_type in enumerate(edge_types)}

    node_ids = list(G.nodes())
    num_nodes = len(node_ids)
    node_id_to_index = {node: i for i, node in enumerate(node_ids)}
    index_to_node = dict(enumerate(node_ids))

    # 1. Node feature matrix (one-hot / multi-hot encoding) and name lookup.
    feature_matrix = torch.zeros((num_nodes, num_features), dtype=torch.float32)
    node_name_to_index = {}
    for node, data in G.nodes(data=True):
        row = node_id_to_index[node]

        node_labels = data['labels']
        for col, combination in enumerate(label_combinations):
            if all(label in node_labels for label in combination):
                feature_matrix[row, col] = 1

        node_props = data.get('properties', {})
        for prop_idx, prop in enumerate(properties):
            if node_props.get(prop, False):
                feature_matrix[row, property_offset + prop_idx] = 1

        os_value = node_props.get('operatingsystem', None)
        if os_value in os_to_index:
            feature_matrix[row, os_offset + os_to_index[os_value]] = 1

        node_name_to_index[data['properties']['name']] = row

    # 2. Single pass over the edges: typed edges fill the adjacency tensor,
    #    "optimal" edges are grouped by their pathId (dict keeps first-seen order).
    adj_tensor = torch.zeros((num_nodes, num_nodes, len(edge_types)), dtype=torch.float32)
    path_edges = {}
    for u, v, data in G.edges(data=True):
        edge_type = data.get('type_')
        if edge_type == "optimal":
            path_id = data['properties']['pathId']
            path_edges.setdefault(path_id, []).append((node_id_to_index[u], node_id_to_index[v]))
            continue
        type_idx = edge_type_to_index.get(edge_type)
        if type_idx is not None:
            adj_tensor[node_id_to_index[u], node_id_to_index[v], type_idx] = 1

    # 3. One (X, Y) pair per distinct pathId.
    X_matrices = []
    Y_matrices = []
    for path_id, index_pairs in path_edges.items():
        y_matrix = torch.zeros((num_nodes, num_nodes), dtype=torch.int64)
        for u_idx, v_idx in index_pairs:
            y_matrix[u_idx, v_idx] = 1
        Y_matrices.append(y_matrix)

        x_matrix = feature_matrix.clone()
        x_matrix[:, owned_column] = 0  # forget every pre-existing "owned" flag
        # The pathId is looked up as a node name; presumably it is the name of
        # the path's start node — TODO confirm against the dataset generator.
        start_idx = node_name_to_index.get(path_id, None)
        if start_idx is not None:
            x_matrix[start_idx, owned_column] = 1
        X_matrices.append(x_matrix)

    return adj_tensor, X_matrices, Y_matrices, list(path_edges), edge_types, index_to_node

def load_graph(graph_path):
    """Unpickle and return the object stored in the file at ``graph_path``.

    Note: unpickling can execute arbitrary code, so only point this at files
    from a trusted source.
    """
    with open(graph_path, mode="rb") as handle:
        graph = pickle.load(handle)
    return graph

def preprocess_graph(G, expected_num_nodes=361, rng=None):
    """Preprocess one graph and pick a single (features, target) pair from it.

    Args:
        G: Graph object accepted by ``_doPreprocess``.
        expected_num_nodes: Graphs whose node count differs from this value are
            rejected. Defaults to 361, the value that used to be hard-coded
            here (NOTE(review): presumably the fixed size the downstream model
            expects — confirm). Pass ``None` to accept any size.
        rng: Optional object with a ``randrange(n)`` method (for example a
            seeded ``random.Random``) used to choose the path. When ``None``
            the global ``random`` module is used, as before.

    Returns:
        ``None`` if the graph is rejected or has no paths; otherwise a tuple
        ``(adj_tensor, X_matrix, Y_matrix)`` for one randomly selected path.
    """
    if expected_num_nodes is not None and len(G.nodes()) != expected_num_nodes:
        return None  # Skip processing if node count is incorrect

    adj_tensor, X_matrices, Y_matrices, _y_nodes, _edge_types, _index_to_node = _doPreprocess(G)

    if not X_matrices or not Y_matrices:
        return None  # Skip if no valid paths exist

    # Randomly select one path; X_matrices and Y_matrices share the same order.
    chooser = random if rng is None else rng
    selected_idx = chooser.randrange(len(X_matrices))
    return adj_tensor, X_matrices[selected_idx], Y_matrices[selected_idx]

def process_dataset(dataset_dir, output_dir, remove_invalid=True):
    """Preprocess every ``.pickle`` graph in ``dataset_dir`` into a ``.pt`` file.

    Each graph is loaded with ``load_graph`` and converted with
    ``preprocess_graph``. Successful results are saved with ``torch.save`` as a
    dict with keys ``adj_tensor``, ``X_matrix`` and ``Y_matrix`` under
    ``output_dir``, using the input file's name with a ``.pt`` extension.

    Args:
        dataset_dir: Directory containing the pickled graphs.
        output_dir: Directory for the generated files; created if missing.
        remove_invalid: When true (the default, matching the previous
            behaviour) input files that ``preprocess_graph`` rejects are
            DELETED from ``dataset_dir``. Pass ``False`` to leave the source
            dataset untouched and merely skip those files.
    """
    os.makedirs(output_dir, exist_ok=True)

    suffix = ".pickle"
    total = 0
    # os.listdir returns entries in arbitrary order; sort for a stable run order.
    for filename in sorted(os.listdir(dataset_dir)):
        if not filename.endswith(suffix):
            continue

        graph_path = os.path.join(dataset_dir, filename)
        processed_data = preprocess_graph(load_graph(graph_path))

        if processed_data is None:
            if remove_invalid:
                print(f"Removing invalid graph: {filename}")
                os.remove(graph_path)
            else:
                print(f"Skipping invalid graph: {filename}")
            continue

        adj_tensor, X_matrix, Y_matrix = processed_data

        # Swap only the trailing extension; str.replace would also rewrite any
        # ".pickle" occurring earlier in the file name.
        output_name = filename[:-len(suffix)] + ".pt"
        output_path = os.path.join(output_dir, output_name)
        torch.save({
            "adj_tensor": adj_tensor,
            "X_matrix": X_matrix,
            "Y_matrix": Y_matrix
        }, output_path)

        total += 1
        print(f"Processed and saved: {output_path}")

    print(f"[+] Done, {total} graphs have been preprocessd !")

In [3]:
# Directory holding the pickled graphs (placeholder: set it before running) and
# directory that receives the generated .pt files.
# Caution: process_dataset deletes input files it considers invalid.
input_folder = "<insert .pickle here>"
output_folder = "_data_"
process_dataset(dataset_dir=input_folder, output_dir=output_folder)

Processed and saved: _data_/graph_jV3LdG20.pt
Processed and saved: _data_/graph_00hliAZI.pt
Processed and saved: _data_/graph_036Idcma.pt
Processed and saved: _data_/graph_04rUpi8y.pt
Processed and saved: _data_/graph_0drBUl8N.pt
Processed and saved: _data_/graph_0LfRgXU1.pt
Processed and saved: _data_/graph_0Lw4zenj.pt
Processed and saved: _data_/graph_0S7NNh0c.pt
Processed and saved: _data_/graph_0V9GStgY.pt
Processed and saved: _data_/graph_0y42FJQ8.pt
Processed and saved: _data_/graph_17V9fxNz.pt
Processed and saved: _data_/graph_19PxHsXJ.pt
Processed and saved: _data_/graph_1az1V5Ue.pt
Processed and saved: _data_/graph_1BTSm7RP.pt
Processed and saved: _data_/graph_1fBE7jxj.pt
Processed and saved: _data_/graph_1hJozdNw.pt
Processed and saved: _data_/graph_1hyY6WK7.pt
Processed and saved: _data_/graph_1Lxpeuur.pt
Processed and saved: _data_/graph_FkOiHF09.pt
Processed and saved: _data_/graph_FL1lfimk.pt
Processed and saved: _data_/graph_fMrr7sps.pt
Processed and saved: _data_/graph_