In [4]:
from library import *
import json
import os
from nodes import *
from utils import *
from collections import *
import itertools
import pandas as pd
import openpyxl
from datetime import datetime

In [5]:
def get_combination_pairs(arr):
    n = len(arr)
    combinations = []
    result = []
    for size in range(1, n + 1):
        for i in range(n - size + 1):
            combinations.append(arr[i:i+size])
    for i in range(len(combinations)):
        for j in range(n):
            if i != j:
                result.append([combinations[i], [arr[j]]])
    return result

In [7]:
path_to_json = '/Users/nghiahoang/Documents/nghiahoang/Summer Research/metagraph/attack-flows'
json_files = [pos_json for pos_json in os.listdir(
    path_to_json) if pos_json.endswith('.json')]
ignore_files = ["JP Morgan Breach.json",
                "FIN13 Case 2.json", "Gootloader.json"]
big_files = ["FIN13 Case 1.json", "CKC.json", "apv.json", "SolarWinds.json", "Conti Ransomware.json", "WhisperGate.json", "IcedID.json", "cdcn.json"]
# # DEFINE NODE CLASS
i = 0
excel_path = 'mainMetagraph.xlsx'

In [8]:
for i in range(len(json_files)):
    file = json_files[i]
    if file in ignore_files or file in big_files or len(file) > 30:
        print('Error: ', file)
        continue
    if file in big_files or len(file) > 30:
        print('Big file: ', file)
        continue
    print(file)
    f = open('attack-flows/'+file)
    data = json.load(f)
    objects = data["objects"]

    # Create nodes from json file

    nodeById, allNodes = convertAllToNodes(objects)

    # Create list for metagraph and add "attack-action", "attack-asset" and "STIX Common Properties" node to the list
    generating_set = set()

    for node in allNodes["actions"]:
        generating_set.add(node)

    # Create metagraph
    mg = Metagraph(generating_set)

    elementsSum = 0
    for k, v in allNodes.items():
        if k != 'operators':
            elementsSum += len(v)

    flow_dict = defaultdict(list)
    
    # set outNodes for each operator
    for node in allNodes["operators"]:
        outnodes = []
        for ref in node.effect_refs:
            for outNode in generating_set:
                if outNode.getId() == ref:
                    outnodes.append(outNode)
            node.setOutNodes(outnodes)

    # Generate edges for the metagraph
    mg, flow_dict = createAttackEdge(mg, generating_set, nodeById, flow_dict)

    mg, flow_dict = createOperatorEdge(mg, allNodes["operators"], flow_dict)

    # from relationships
    mg, flow_dict = createRelationshipEdge(
        mg, allNodes["relationships"], nodeById, flow_dict)

    # metagraph adjacency matrix and incidence
    A = mg.adjacency_matrix()
    I = mg.incidence_matrix()

    combination_pairs = get_combination_pairs(list(generating_set))

    number_of_metapaths = 0
    avg_edge_list = 0
    avg_include_nodes = 0
    metapaths_set = set()
    longest_metapath = float('-inf')
    shortest_metapath = float('inf')
    for pair in combination_pairs:
        if set(pair[0]) != set(pair[1]):
            metapaths = mg.get_all_metapaths_from(set(pair[0]), set(pair[1]))
            if metapaths != None and len(metapaths) > 0:
                metapaths_set.update(metapaths)
                for metapath in metapaths:
                    number_of_metapaths += 1
                    longest_metapath = max(longest_metapath, len(metapath.edge_list))
                    shortest_metapath = min(shortest_metapath, len(metapath.edge_list))
                    included_nodes = set()
                    avg_edge_list += len(metapath.edge_list)
                    for edge in metapath.edge_list:
                        for tmp_node in edge.invertex.union(edge.outvertex, pair[0], pair[1]):
                            included_nodes.add(tmp_node)
                    avg_include_nodes += len(included_nodes)
    number_of_metapaths = len(metapaths_set)
    avg_include_nodes = avg_include_nodes//number_of_metapaths
    avg_edge_list = avg_edge_list//number_of_metapaths
    # print('number of nodes:', len(generating_set))
    # print('number of edges:', len(mg.edges))
    # print('Number of metapaths:', number_of_metapaths)
    # print('Average number of include nodes:', avg_include_nodes)
    # print('Average path length:', avg_edge_list)
    # print('Highest degree nodes:', mg.get_most_degree_nodes())
    # print('Highest rank nodes:', mg.get_most_rank_nodes())
    print(f'Attackflow {file[:-5]} has {len(generating_set)} nodes, {len(mg.edges)} edges, {number_of_metapaths} metapaths (longest: {longest_metapath}, shortest: {shortest_metapath}), the average paths length is {avg_edge_list}, the nodes with highest degree is {mg.get_most_degree_nodes()}, the nodes with highest rank is {mg.get_most_rank_nodes()}.')


Tesla.json
Attackflow Tesla has 6 nodes, 5 edges, 85 metapaths (longest: 3, shortest: 1), the average paths length is 1, the nodes with highest degree is [T1530, T1610, T1496, T1552.001, T1078.004], the nodes with highest rank is [T1530, T1610, T1496, T1552.001, T1078.004].
solarwinds 2.json
Attackflow solarwinds 2 has 8 nodes, 7 edges, 1268 metapaths (longest: 7, shortest: 1), the average paths length is 3, the nodes with highest degree is [T1584, T1136, T1212, T1133, T1606.002, T1534, T1078], the nodes with highest rank is [T1584, T1136, T1212, T1133, T1606.002, T1534, T1078].
Error:  JP Morgan Breach.json
apt-compromise.json
Attackflow apt-compromise has 21 nodes, 16 edges, 42624 metapaths (longest: 10, shortest: 1), the average paths length is 4, the nodes with highest degree is [T1071.004], the nodes with highest rank is [T1071.004].
Equifax Breach.json
Attackflow Equifax Breach has 12 nodes, 11 edges, 14708 metapaths (longest: 10, shortest: 1), the average paths length is 5, the 