In [29]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("Microsoft/microsoft-security-incident-prediction")
# print("Path to dataset files:", path)



In [31]:
# Read aggregate file from money movement threat actors
import json

# Open the JSON file and load its contents
with open('./aggregate_all.json', 'r') as file:
    json_data = json.load(file)

agg_data = dict()
for (k, v) in json_data["Techniques"].items():
    tid = v["id"]
    agg_data[tid] = {"count": v["count"], "name": k}

agg_mit = dict()
for (k, v) in json_data["Mitigations"].items():
    agg_mit[k] = {"count": v["count"]}

# Now 'data' is a Python dictionary (or list, depending on the JSON structure)
top_15 = list(agg_mit.keys())[:15]
print(top_15)

['Execution Prevention', 'Behavior Prevention on Endpoint', 'Privileged Account Management', 'Disable or Remove Feature or Program', 'User Training', 'Restrict Web-Based Content', 'Software Configuration', 'Pre-compromise', 'Antivirus/Antimalware', 'Exploit Protection', 'Network Intrusion Prevention', 'Data Backup', 'Code Signing', 'Operating System Configuration', 'Restrict File and Directory Permissions']


In [32]:
import os
train_path = os.path.join(path, "GUIDE_Train.csv")
test_path = os.path.join(path, "GUIDE_Test.csv")

train_counts = {}
test_counts = {}
for tech in agg_data:
    train_counts[tech] = {"count": 0, "name": agg_data[tech]["name"]}
    test_counts[tech] = {"count": 0, "name": agg_data[tech]["name"]}

chunk_size = 10000  # Adjust depending on your available memory

train_chunks = pd.read_csv(train_path, chunksize=chunk_size)
test_chunks = pd.read_csv(test_path, chunksize=chunk_size)

for (chunks, msft_counts, label) in [(train_chunks, train_counts, "TRAIN"), (test_chunks, test_counts, "TEST")]:
    for df in chunks:
        df['MitreTechniques'] = df['MitreTechniques'].astype('category')
        # Filter for rows where IncidentGrade is 'TruePositive'
        df = df[df['IncidentGrade'] == 'TruePositive']
        # Drop rows where MitreTechniques is null
        df = df.dropna(subset=['MitreTechniques'])
    
        # Filter for rows where MitreTechniques contains any value from the 'agg_data' list
        filtered_df = df[df['MitreTechniques'].apply(lambda x: any(technique in x for technique in agg_data.keys()))]

        for tech in agg_data:
            # Create the regex pattern to match the exact technique code (e.g., T1008)
            pattern = r'\b' + tech + r'\b'
        
            # Count rows where MitreTechniques contains 'T1008'
            count = df['MitreTechniques'].str.contains(pattern).sum()
            msft_counts[tech]["count"] += int(count)

print(train_counts)
print(test_counts)

{'T1588': {'count': 2, 'name': 'Obtain Capabilities'}, 'T1588.002': {'count': 0, 'name': 'Tool'}, 'T1566': {'count': 474121, 'name': 'Phishing'}, 'T1566.001': {'count': 39127, 'name': 'Spearphishing Attachment'}, 'T1204': {'count': 2902, 'name': 'User Execution'}, 'T1204.002': {'count': 1445, 'name': 'Malicious File'}, 'T1543': {'count': 2317, 'name': 'Create or Modify System Process'}, 'T1543.003': {'count': 2317, 'name': 'Windows Service'}, 'T1027': {'count': 1185, 'name': 'Obfuscated Files or Information'}, 'T1218': {'count': 790, 'name': 'System Binary Proxy Execution'}, 'T1059': {'count': 5850, 'name': 'Command and Scripting Interpreter'}, 'T1059.001': {'count': 4261, 'name': 'PowerShell'}, 'T1059.003': {'count': 951, 'name': 'Windows Command Shell'}, 'T1059.005': {'count': 612, 'name': 'Visual Basic'}, 'T1053': {'count': 316, 'name': 'Scheduled Task/Job'}, 'T1053.005': {'count': 225, 'name': 'Scheduled Task'}, 'T1562': {'count': 280, 'name': 'Impair Defenses'}, 'T1562.004': {'cou

In [33]:
import json
import scipy.stats as stats
import itertools
from collections import OrderedDict

def spearmans_rank_correlation(order, label1, list1, label2, list2):
    ranked_list1 = [order.index(x) + 1 for x in list1]
    ranked_list2 = [order.index(x) + 1 for x in list2]
    # print(ranked_list1)
    # print(ranked_list2)

    # Compute Spearman's rank correlation
    correlation, p_value = stats.spearmanr(ranked_list1, ranked_list2)
    
    # Print the result
    print(f"Comparing {label1} and {label2}")
    print(f"Spearman's rank correlation: {correlation}")
    print(f"P-value: {p_value}")

# sort MSIP techniques by descending 
sortd = OrderedDict(sorted(train_counts.items(), key=lambda item: item[1]['count'], reverse=True))
train_counts = sortd

sortd = OrderedDict(sorted(test_counts.items(), key=lambda item: item[1]['count'], reverse=True))
test_counts = sortd

# comparing techniques (all)
technique_order = list(agg_data.keys())
to_compare = [("Money Movement Actors", agg_data), ("MSIP train", train_counts), ("MSIP test", test_counts)]

print("Comparing Techniques (all)")

for pair in itertools.combinations(to_compare, 2):
    ((label1, data1), (label2, data2)) = pair
    # Convert strings to ranks
    list1 = list(data1.keys())
    list2 = list(data2.keys())
    spearmans_rank_correlation(technique_order, label1, list1, label2, list2)

print("\nComparing Techniques (TOP 15)")
TOP = 15
for pair in itertools.combinations(to_compare, 2):
    ((label1, data1), (label2, data2)) = pair
    # Convert strings to ranks
    list1 = list(data1.keys())[:TOP]
    list2 = list(data2.keys())[:TOP]
    spearmans_rank_correlation(technique_order, label1, list1, label2, list2)


Comparing Techniques (all)
Comparing Money Movement Actors and MSIP train
Spearman's rank correlation: 0.4041386889684103
P-value: 2.2111537552360217e-07
Comparing Money Movement Actors and MSIP test
Spearman's rank correlation: 0.41786954016056177
P-value: 7.686738854877323e-08
Comparing MSIP train and MSIP test
Spearman's rank correlation: 0.3987307841797006
P-value: 3.309608302218478e-07

Comparing Techniques (TOP 15)
Comparing Money Movement Actors and MSIP train
Spearman's rank correlation: -0.09999999999999999
P-value: 0.7228973252791182
Comparing Money Movement Actors and MSIP test
Spearman's rank correlation: -0.035714285714285705
P-value: 0.8994469934720875
Comparing MSIP train and MSIP test
Spearman's rank correlation: 0.6357142857142856
P-value: 0.0108607461687898


In [34]:
with open("./microsoft-security-incident-prediction_v1/train_mit_count.json", "r") as f:
    train_mit = json.load(f)

with open("./microsoft-security-incident-prediction_v1/test_mit_count.json", "r") as f:
    test_mit = json.load(f)

sortd = OrderedDict(sorted(train_mit.items(), key=lambda item: item[1]['count'], reverse=True))
train_mit = sortd

sortd = OrderedDict(sorted(test_mit.items(), key=lambda item: item[1]['count'], reverse=True))
test_mit = sortd

# comparing mitigations (all)
mitigation_order = list(agg_mit.keys())

to_compare = [("Money Movement Actors", agg_mit), ("MSIP train", train_mit), ("MSIP test", test_mit)]

print("Comparing Mitigations (all)")
for pair in itertools.combinations(to_compare, 2):
    ((label1, data1), (label2, data2)) = pair
    # Convert strings to ranks
    list1 = list(data1.keys())
    list2 = list(data2.keys())
    spearmans_rank_correlation(mitigation_order, label1, list1, label2, list2)

print("\nComparing Mitigations (TOP 15)")
TOP = 15
for pair in itertools.combinations(to_compare, 2):
    ((label1, data1), (label2, data2)) = pair
    # Convert strings to ranks
    list1 = list(data1.keys())[:TOP]
    list2 = list(data2.keys())[:TOP]
    spearmans_rank_correlation(mitigation_order, label1, list1, label2, list2)

for pair in itertools.combinations(to_compare, 2):
    ((label1, data1), (label2, data2)) = pair
    list1 = list(data1.keys())[:TOP]
    list2 = list(data2.keys())[:TOP]
    print(label1, list1)
    print(label2, list2)


Comparing Mitigations (all)
Comparing Money Movement Actors and MSIP train
Spearman's rank correlation: 0.3259838786154575
P-value: 0.04895524296243798
Comparing Money Movement Actors and MSIP test
Spearman's rank correlation: 0.31460407776197247
P-value: 0.0579004463432323
Comparing MSIP train and MSIP test
Spearman's rank correlation: 0.6993835941204362
P-value: 1.4651328416253839e-06

Comparing Mitigations (TOP 15)
Comparing Money Movement Actors and MSIP train
Spearman's rank correlation: -0.28928571428571426
P-value: 0.29566536100437785
Comparing Money Movement Actors and MSIP test
Spearman's rank correlation: -0.28571428571428564
P-value: 0.3019363513225493
Comparing MSIP train and MSIP test
Spearman's rank correlation: 0.9964285714285712
P-value: 2.4159793199703526e-15
Money Movement Actors ['Execution Prevention', 'Behavior Prevention on Endpoint', 'Privileged Account Management', 'Disable or Remove Feature or Program', 'User Training', 'Restrict Web-Based Content', 'Software C

In [35]:
print(len(agg_data.keys()))
print(len(agg_mit.keys()))

153
37
