In [None]:
import networkx as nx
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import glob
from collections import Counter, defaultdict

In [None]:
directory = "" # Your path to raw data here

# glob returns matches in arbitrary order; sorting keeps the insertion order of
# `graphs` (and therefore every per-graph table built from it) reproducible.
graph_files = sorted(glob.glob(os.path.join(directory, "graph_*.pickle")))
if not graph_files:
    print(f"Warning: no files matching 'graph_*.pickle' found in {(directory or os.getcwd())!r}.")

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only point `directory` at files you created yourself or that come from a trusted source.
graphs = {}  # maps file path -> loaded NetworkX graph
for file in graph_files:
    with open(file, "rb") as f:
        G = pickle.load(f)
    # nx.Graph is the base class of DiGraph / MultiGraph / MultiDiGraph,
    # so this check accepts every NetworkX graph flavour.
    if isinstance(G, nx.Graph):
        graphs[file] = G
    else:
        print(f"Warning: {file} is not a valid NetworkX graph.")

print(f"Loaded {len(graphs)} graphs successfully!")

In [None]:
# Summary statistics for every loaded graph:
#   - number of nodes and number of edges
#   - density as computed by nx.density
#     (2*|E|/(|V|*(|V|-1)) for undirected graphs; NetworkX drops the factor 2
#      for directed graphs)
#   - mean, median and maximum node degree (0 when the graph has no nodes)

graph_stats = []
for name, G in graphs.items():
    degrees = [deg for _node, deg in G.degree()]
    has_nodes = bool(degrees)

    record = {
        "graph_name": name,
        "num_nodes": G.number_of_nodes(),
        "num_edges": G.number_of_edges(),
        "density": nx.density(G),
        "avg_degree": np.mean(degrees) if has_nodes else 0,
        "median_degree": np.median(degrees) if has_nodes else 0,
        "max_degree": max(degrees) if has_nodes else 0,
    }
    graph_stats.append(record)

# One row per graph; describe() gives the dataset-wide summary shown as cell output.
df_stats = pd.DataFrame(graph_stats)
df_stats.describe()

In [None]:
# Distribution of edge counts across graphs

def plot_num_edges_distribution(df_stats, 
                                bins=25, 
                                xlabel="Number of Edges", 
                                ylabel="Frequency", 
                                title="Distribution of Number of Edges Across Graphs",
                                save_path='EDA/edgecount_distribution.jpeg'):
    
    """
    Plot the distribution of the number of edges in graphs.

    Parameters
    ----------
    df_stats : pandas.DataFrame
        Must contain a "num_edges" column (one row per graph).
    bins : int
        Number of histogram bins passed to seaborn.
    xlabel, ylabel, title : str
        Axis labels and figure title.
    save_path : str or None
        Where to write the figure. The parent directory is created if it does
        not exist. Pass None (or "") to skip saving.
    """
    
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.histplot(df_stats["num_edges"], bins=bins, kde=True, alpha=.5, edgecolor="white", ax=ax)
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel(ylabel, fontsize=10)
    ax.set_title(title, fontsize=10)
    ax.grid(color='black', linestyle='-', linewidth=.5, alpha=.3)
    ax.set_frame_on(False)

    # Apply the layout BEFORE saving so the file on disk matches what is displayed.
    fig.tight_layout()
    if save_path:
        # savefig does not create missing directories; do it explicitly.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        fig.savefig(save_path, 
                    dpi=400, 
                    bbox_inches='tight', 
                    transparent=True)
    plt.show()


plot_num_edges_distribution(df_stats)

In [None]:
def plot_edge_multiplicity_distribution(graphs, 
                                        xlabel="Number of Parallel Edges", 
                                        ylabel="Frequency", 
                                        title="Edge Multiplicity Distribution Across Graphs",
                                        save_path='EDA/edge_multiplicity_distribution.jpeg'):
    """
    Plot the distribution of edge multiplicities (parallel edges) across graphs.

    For every edge (u, v) yielded by ``G.edges()`` the value
    ``G.number_of_edges(u, v)`` is recorded, and the bar chart (log-scaled
    y-axis) shows how often each value occurs.

    NOTE(review): the tally is per *edge*, not per node pair. In a multigraph a
    pair joined by k parallel edges is yielded k times and therefore adds k to
    the bar for multiplicity k. Confirm this is the intended weighting.

    Parameters
    ----------
    graphs : dict
        Mapping whose values are NetworkX graphs.
    xlabel, ylabel, title : str
        Axis labels and figure title.
    save_path : str or None
        Where to write the figure. The parent directory is created if it does
        not exist. Pass None (or "") to skip saving.
    """

    # Edge attributes are not needed here, so iterate over bare (u, v) pairs.
    multiplicity_counts = Counter(
        G.number_of_edges(u, v)
        for G in graphs.values()
        for u, v in G.edges()
    )

    fig, ax = plt.subplots(figsize=(6, 5))
    ax.bar(list(multiplicity_counts.keys()), list(multiplicity_counts.values()), edgecolor="white", alpha=0.5)
    ax.set_yscale("log")
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel(ylabel, fontsize=10)
    ax.set_title(title, fontsize=10)
    ax.grid(color='black', linestyle='-', linewidth=.5, alpha=.3)
    ax.set_frame_on(False)

    # Apply the layout BEFORE saving so the file on disk matches what is displayed.
    fig.tight_layout()
    if save_path:
        # savefig does not create missing directories; do it explicitly.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        fig.savefig(save_path, dpi=400, bbox_inches='tight', transparent=True)
    plt.show()

plot_edge_multiplicity_distribution(graphs)

In [None]:
# Correlation heatmap showing which edge types are commonly found in parallel

def plot_edge_type_correlation_heatmap(graphs, save_path='EDA/edge_type_correlation.jpeg'):
    
    """
    Plot a heatmap showing how often two edge types are found together between
    the same (u, v) node pair of the same graph.

    Cell (a, b) of the heatmap is the number of node pairs that carry at least
    one edge of type ``a`` and at least one edge of type ``b`` (raw
    co-occurrence counts, symmetric, zero diagonal). Only edge types that
    co-occur with some other type appear on the axes. The type "optimal" is
    displayed as "Path".

    Parameters
    ----------
    graphs : dict
        Mapping graph key -> NetworkX graph. Edge type is read from the
        "type_" edge attribute ("Unknown" when missing).
    save_path : str or None
        Where to write the figure. The parent directory is created if it does
        not exist. Pass None (or "") to skip saving.
    """
    
    # Key by (graph, u, v): node ids are only meaningful inside one graph, so
    # pairs from different graphs that happen to share ids must not be merged.
    # A dict (not a set) is used per pair to keep first-seen order deterministic.
    types_per_pair = defaultdict(dict)
    for graph_key, G in graphs.items():
        for u, v, data in G.edges(data=True):
            edge_type = data.get("type_", "Unknown")
            types_per_pair[(graph_key, u, v)][edge_type] = True

    # Symmetric co-occurrence tally over every unordered pair of distinct types.
    edge_type_corr_matrix = defaultdict(lambda: defaultdict(int))
    for seen_types in types_per_pair.values():
        types = list(seen_types)
        for i, type_a in enumerate(types):
            for type_b in types[i + 1:]:
                edge_type_corr_matrix[type_a][type_b] += 1
                edge_type_corr_matrix[type_b][type_a] += 1

    edge_types = list(edge_type_corr_matrix.keys())
    if not edge_types:
        # sns.heatmap cannot draw an empty matrix; report instead of crashing.
        print("No node pair carries more than one edge type; nothing to plot.")
        return
    
    corr_matrix = np.zeros((len(edge_types), len(edge_types)))
    for i, edge_type_i in enumerate(edge_types):
        for j, edge_type_j in enumerate(edge_types):
            corr_matrix[i, j] = edge_type_corr_matrix[edge_type_i].get(edge_type_j, 0)

    corr_df = pd.DataFrame(corr_matrix, index=edge_types, columns=edge_types)
    corr_df = corr_df.rename(index={"optimal": "Path"}, columns={"optimal": "Path"})  # Rename for clarity if needed

    plt.figure(figsize=(6,5))
    sns.heatmap(corr_df, 
                annot=False, 
                cmap="Blues", 
                linewidths=.5, 
                cbar_kws={'label': 'Pair Frequency'})
    
    plt.title("Correlation of Edge Types", fontsize=10)
    plt.xlabel("Edge Type", fontsize=10)
    plt.ylabel("Edge Type", fontsize=10)
    plt.xticks(fontsize=8, rotation=90)
    plt.yticks(fontsize=8)

    # Apply the layout BEFORE saving so the file on disk matches what is displayed.
    plt.tight_layout()
    if save_path:
        # savefig does not create missing directories; do it explicitly.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        plt.savefig(save_path, dpi=400, bbox_inches='tight', transparent=True)
    plt.show()

plot_edge_type_correlation_heatmap(graphs)

In [None]:
# Plot counts for: OS types, node types, node properties, edge types

# Dataset-wide tallies, filled by the loop below.
node_label_counts = Counter()
os_counts = Counter()
property_counts = Counter()
edge_type_counts = Counter()

# A node is counted under a combination when its "labels" contain every item of it.
label_combinations = [['Base', 'Computer'], ['Base', 'OU'], ['Base', 'User'], ['Base', 'Group'], 
                      ['Base', 'GPO'], ['Base', 'Domain']]
# NOTE(review): "Windows Server 2008" is listed twice. This is harmless for the
# membership test below, but the second entry may have been meant to be a
# different OS name - confirm against the data.
operating_systems = ["Windows Server 2003", "Windows Server 2008", "Windows 7", "Windows 10", 
                     "Windows XP", "Windows Server 2012", "Windows Server 2008"]
# Node properties counted when their value is truthy.
properties = ["enabled", "hasspn", "highvalue", "is_vulnerable", "target", "owned"]

for G in graphs.values():
    for _node, attrs in G.nodes(data=True):
        node_labels = attrs.get("labels", [])
        node_props = attrs.get('properties', {})

        for combo in label_combinations:
            if all(item in node_labels for item in combo):
                node_label_counts[str(combo)] += 1

        os_value = node_props.get('operatingsystem', None)
        if os_value in operating_systems:
            os_counts[os_value] += 1

        for prop in properties:
            if node_props.get(prop, False):
                property_counts[prop] += 1

    # Edges without a "type_" attribute are tallied under "Unknown".
    edge_type_counts.update(
        attrs.get("type_", "Unknown") for _, _, attrs in G.edges(data=True)
    )


def plot_histograms_in_quadrant_percentage_only(counts_list, titles, xlabels, rotation=90,
                                                save_path='EDA/quadrant_properties.jpeg'):
    """
    Draw up to four count bar charts in a 2x2 grid, each bar annotated with its
    share of the panel total.

    Bars are sorted by descending count. Two panels get special treatment:

    * panel 0: tick labels are shortened to the text after the last comma of
      each key with the surrounding brackets stripped (keys are expected to be
      stringified lists such as "['Base', 'User']");
    * panel 3: the last 20 bars (the smallest counts) are annotated with four
      decimals, rotated and centred; all other bars use two decimals.

    Panels whose counts are empty or sum to zero are left without bars.

    Parameters
    ----------
    counts_list : sequence of mappings key -> count (at most four are drawn).
    titles, xlabels : sequences of str, parallel to ``counts_list``.
    rotation : rotation (degrees) of the x tick labels.
    save_path : str or None
        Where to write the figure. The parent directory is created if it does
        not exist. Pass None (or "") to skip saving.
    """
    label_panel = 0   # panel whose tick labels are shortened
    tail_panel = 3    # panel whose smallest bars get high-precision labels
    tail_size = 20    # number of trailing bars treated as the "tail"

    fig, axes = plt.subplots(2, 2, figsize=(12, 12))
    axes = axes.flatten()  # Flatten the axes array for easier indexing
    
    for i, (counts, title, xlabel) in enumerate(zip(counts_list, titles, xlabels)):
        ax = axes[i]
        total = sum(counts.values())

        # Guard: with no data the percentage division and max() below would fail.
        if total > 0:
            ordered = sorted(counts.items(), key=lambda item: item[1], reverse=True)
            keys = [key for key, _ in ordered]
            values = [value for _, value in ordered]

            sns.barplot(x=keys, y=values, ax=ax, alpha=.5)

            if i == label_panel:
                new_labels = [label.split(',')[-1].strip("[]") for label in keys]
                ax.set_xticklabels(new_labels, rotation=rotation, fontsize=8)

            max_y_value = max(values)

            # Index of the first bar that belongs to the tail. Clamped at 0 so a
            # panel with fewer than `tail_size` bars does not produce negative
            # indices (which would draw duplicate labels at negative x positions).
            if i == tail_panel:
                first_tail_idx = max(len(values) - tail_size, 0)
            else:
                first_tail_idx = len(values)

            for idx, count in enumerate(values):
                percentage = (count / total) * 100
                if idx >= first_tail_idx:
                    # Tail bars: lift the label by 5% of the tallest bar and rotate it.
                    y_position = count + max_y_value * 0.05
                    ax.text(idx, y_position, f'{percentage:.4f}%', ha='center', fontsize=8, rotation=70)
                else:
                    # Regular bars: lift the label by 1.5% of the tallest bar.
                    y_position = count + max_y_value * 0.015
                    ax.text(idx, y_position, f'{percentage:.2f}%', ha='left', fontsize=8)

        ax.set_xlabel(xlabel, fontsize=10)
        ax.set_ylabel("Count", fontsize=10)
        ax.set_title(title, fontsize=10)
        ax.tick_params(axis="x", rotation=rotation, labelsize=8)
        ax.tick_params(axis="y", labelsize=8)
    
    # Apply the layout BEFORE saving so the saved panels do not overlap.
    plt.tight_layout()
    if save_path:
        # savefig does not create missing directories; do it explicitly.
        os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)
        plt.savefig(save_path, dpi=400, bbox_inches='tight', transparent=True)
    plt.show()

counts_list = [node_label_counts, os_counts, property_counts, edge_type_counts]
titles = ["Node Type Distribution", "Operating System Distribution", "Node Properties Distribution", "Edge Type Distribution"]
xlabels = ["Node Type", "OS Type", "Properties", "Edge Type"]
plot_histograms_in_quadrant_percentage_only(counts_list, titles, xlabels)

In [None]:
# Per-edge-type histograms: for each edge type, how many edges of that type a graph has.

# NOTE(review): this list is not used by the code in this cell; the panels below
# are driven by the edge types actually present in the data.
edge_types = ["AdminTo", "AllowedToDelegate", "CanRDP", "Contains", "DCSync", "ExecuteDCOM", 
              "GenericAll", "GetChanges", "GetChangesAll", "GpLink", "HasSession", "MemberOf", 
              "Open", "Owns", "WriteDacl", "WriteOwner"]

# edge type -> list with one count per graph that contains at least one edge of
# that type (graphs without the type contribute no sample, not a zero).
# NOTE: this rebinds `edge_type_counts`, which the previous cell used as a Counter.
edge_type_counts = defaultdict(list)
for graph_name, G in graphs.items():
    edge_count = Counter(data.get("type_", "Unknown") for _, _, data in G.edges(data=True))
    for edge_type, count in edge_count.items():
        edge_type_counts[edge_type].append(count)

num_edge_types = len(edge_type_counts)
n_cols = 3
# Ceil division; at least one row so plt.subplots does not fail when there are no edges.
n_rows = max((num_edge_types + n_cols - 1) // n_cols, 1)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = axes.flatten()
for idx, (edge_type, counts) in enumerate(edge_type_counts.items()):
    sns.histplot(counts, kde=True, bins=15, ax=axes[idx], alpha=.5, edgecolor='white')
    axes[idx].set_xlabel(f'{edge_type} Edges')
    axes[idx].set_ylabel("Graphs")

# Hide the unused panels of the last row.
for i in range(num_edge_types, len(axes)):
    axes[i].axis('off')

# Apply the layout BEFORE saving so the saved panels do not overlap.
plt.tight_layout()
os.makedirs('EDA', exist_ok=True)  # savefig does not create missing directories
plt.savefig('EDA/distribution_edge_type.jpeg', dpi=400, bbox_inches='tight', transparent=True)
plt.show()

In [None]:
# Plot the attack path length distribution

# Path length of a graph = number of its edges whose "type_" attribute is "optimal".
path_lengths = [
    sum(1 for _, _, data in G.edges(data=True) if data.get("type_") == "optimal")
    for G in graphs.values()
]

sns.histplot(path_lengths, kde=True, bins=25, edgecolor='white',alpha=.5)
plt.title("Distribution of Attack Path Lengths")
plt.xlabel("Attack Path Length")
plt.ylabel("Graphs")

# Apply the layout BEFORE saving so the file on disk matches what is displayed.
plt.tight_layout()
os.makedirs('EDA', exist_ok=True)  # savefig does not create missing directories
plt.savefig('EDA/path_lenght_distribution.jpeg', dpi=400, bbox_inches='tight', transparent=True)
plt.show()

In [None]:
# Compare the degree of nodes on the attack path with the degree of all nodes in the graph

# Degrees above this value are dropped from both samples to keep the histogram readable.
MAX_PLOTTED_DEGREE = 200

path_node_degrees = []
all_node_degrees = []

for graph_name, G in graphs.items():
    # Collect BOTH endpoints of every "optimal" edge, once per graph. Taking only
    # the first endpoint would miss the terminal node of the path and, for
    # undirected graphs, depend on the arbitrary orientation edges are reported in.
    path_nodes = set()
    for u, v, data in G.edges(data=True):
        if data.get("type_") == "optimal":
            path_nodes.update((u, v))

    path_node_degrees.extend(G.degree(node) for node in path_nodes)
    all_node_degrees.extend(degree for _, degree in G.degree())

path_node_degrees = [degree for degree in path_node_degrees if degree <= MAX_PLOTTED_DEGREE]
all_node_degrees = [degree for degree in all_node_degrees if degree <= MAX_PLOTTED_DEGREE]

plt.figure(figsize=(10, 6))
sns.histplot(path_node_degrees, color='red', kde=True, bins=25, label="In path", alpha=0.25, edgecolor="white", stat="density")
sns.histplot(all_node_degrees, kde=True, label="In graph", bins=25, alpha=0.5, edgecolor="white", stat="density")
plt.title("Node Degree Distribution: Optimal Path vs All Nodes")
plt.xlabel("Node Degree")
plt.ylabel("Density")
# The legend must exist before saving, otherwise the saved image has none.
plt.legend()
os.makedirs('EDA', exist_ok=True)  # savefig does not create missing directories
plt.savefig('EDA/path_degree_vs_graph_degree_distribution.jpeg', dpi=400, bbox_inches='tight', transparent=True)
plt.show()