In [48]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # matrix construction
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

import pandas as pd
import json
import os
import re

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

In [2]:
tqdm.pandas()

In [3]:
nlp = spacy.load("en_core_web_sm", disable=['parser','tagger', 'parser', 'ner']) 

# Opening Files: 

In [23]:
dataset = "training_dataset_full.csv"
path_data =  "../src/training_dataset_full.csv"

# Converting to Data Frames: 

df = pd.read_csv(path_data).reset_index(drop = True)

In [24]:
# Converting from string to list using literal_eval:

for col in ['mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id']:
    df[col] = df[col].progress_apply(literal_eval)

100%|██████████| 1520/1520 [00:00<00:00, 39928.74it/s]
100%|██████████| 1520/1520 [00:00<00:00, 20090.45it/s]
100%|██████████| 1520/1520 [00:00<00:00, 17099.77it/s]
100%|██████████| 1520/1520 [00:00<00:00, 31302.31it/s]
100%|██████████| 1520/1520 [00:00<00:00, 43115.96it/s]


In [76]:
# df = df.explode(['tech_id']).reset_index(drop = True)

In [12]:
# Removing non-alphabetical words:

def preprocess(text):
    return re.sub(r'[^a-zA-Z]', ' ', text)

df['text'] = df['text'].progress_apply(preprocess)

100%|██████████| 1520/1520 [00:22<00:00, 67.81it/s]


# Malware Extraction:

In [13]:
# To prevent overfit, we extract all malware names in a list: 

malware_names = []

root_folder = '../data/'
folder_names = ['ics-malware', 'malware']
for folder_name in folder_names: 
    folder = os.path.join(root_folder, folder_name)
    for filename in os.listdir(folder): 
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename)) as file:
                file_json = json.load(file)["objects"][0]
                    
                # retrieve information:
                if 'x_mitre_aliases' in file_json:
                    malware_names.extend(file_json['x_mitre_aliases']) 
               
                malware_names.append(file_json['name'])
                
                pass
            pass
        pass
    pass
pass
                    


In [14]:
malware_names = set(malware_names)
print(malware_names)

{'OSX.Bundlore', 'Homux', 'FALLCHILL', 'NeD Worm', 'Epic', 'GreyEnergy', 'Retefe', '4H RAT', 'BotgenStudios', 'Wipbot', 'SLOWDRIFT', 'Crutch', 'OSX/Keydnap', 'Darkmoon', 'Mespinoza', 'Emissary', 'Black Energy', 'ZxShell', 'BS2005', 'Vasport', 'Felismus', 'RGDoor', 'HTTPBrowser', 'Backdoor.Nidiran', 'KGH_SPY ', 'Kessel', 'RemoteCMD', 'WEBC2', 'Sednit', 'Neoichor', 'Clambling', 'ROCKBOOT', 'Nebulae', 'Penquin_x64', 'HALFBAKED', 'PowerPunch', 'Diavol', 'ASPXTool', 'TinyZBot', 'xCaon', 'TVT', 'Sakurel', 'BabyShark', 'RogueRobin', 'Flame', 'Korplug', 'Cozer', 'BoomBox', 'Zekapab', 'dfls', 'Win32/KillDisk.NBI', 'Seasalt', 'Kazuar', 'POORAIM', 'QakBot', 'BUGJUICE', 'P.A.S. Webshell', 'StrongPity', 'Pony', 'Sasfis', 'HyperSSL', 'MdmBot', 'Linux Rabbit', 'Wiper', 'TajMahal', 'NanoCore', 'QuietSieve', 'StoneDrill', 'GravityRAT', 'SodaMaster', 'BADCALL', 'FLIPSIDE', 'Get2', 'EVILTOSS', 'Carbanak', 'Gazer', 'GuLoader', 'Trojan.Karagany', 'KEYMARBLE', 'Ixeshe', 'Ferocious', 'ThiefQuest', 'Okrum', '

In [15]:
# Replace malware names with generic terms: 
malware_regex = r'(\b' + r'\b|\b'.join(malware_names) + r'\b)'
def replace_malware(text):
    return re.sub(malware_regex, ' MALWARE_NAME ', text, 0, re.IGNORECASE)
df['text'] = df['text'].progress_apply(replace_malware)

100%|██████████| 1520/1520 [07:04<00:00,  3.58it/s] 


In [16]:
# Merge techniques and subtechniques togethere:

def remove_sub_techs(techs):
    for i, t in enumerate(techs):
        if '.' in t:
            techs[i] = t.split('.')[0]
    return techs
df['tech_id'].progress_apply(remove_sub_techs)

100%|██████████| 1520/1520 [00:00<00:00, 280691.33it/s]


0              [T1487, T1485, T1561, T1058, T1574, T1031]
1                            [T1487, T1036, T1485, T1561]
2       [T1487, T1485, T1561, T1059, T1485, T1070, T11...
3                                   [T1487, T1485, T1561]
4                                                 [T1555]
                              ...                        
1515    [T1123, T1547, T1059, T1555, T1025, T1083, T10...
1516    [T1071, T1547, T1059, T1070, T1105, T1027, T1218]
1517                                              [T1573]
1518    [T1071, T1560, T1059, T1140, T1105, T1505, T1033]
1519    [T1071, T1119, T1547, T1115, T1059, T1132, T10...
Name: tech_id, Length: 1520, dtype: object

In [17]:
df.to_csv('merged_dataset_noMalwareNames.csv', index=False)

# Feature Extraction:

In [25]:
# ------------ Count Vectorizer --------------- 

# cv = CountVectorizer(analyzer='word', stop_words='english', lowercase=False,
                        #min_df=0.01) # if words used less than 0.001 % --> ignore  
# data = cv.fit_transform(df_tech['text']) 

# df_dtm = pd.DataFrame(data.toarray(), columns=cv.get_feature_names())

# ---------------- TF-IDF ---------------------: 


tf_idf = TfidfVectorizer(analyzer = 'word', stop_words='english', lowercase=True, min_df=2, max_df=0.99)

data = tf_idf.fit_transform(df['text'])

X = pd.DataFrame(data.toarray(), columns=tf_idf.get_feature_names()) 




Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



In [26]:
# Multi Label Binarizer: 

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(df['tactic_id'])
Y = pd.DataFrame(Y, columns=mlb.classes_)

In [8]:
#Y = df['tech_id']

In [27]:
X.head()

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaaaaaaaaaaaaaaaa,aaaab,aaab,aaac,aaad,...,zzy,zzyx,zzz,zzzb,zzzd,zzzh,zzzj,zzzl,zzzz,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.008879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.011719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.063673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
X.columns

Index(['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaa', 'aaaaaaaaaaaaaaaaaaaaaa',
       'aaaab', 'aaab', 'aaac', 'aaad',
       ...
       'zzy', 'zzyx', 'zzz', 'zzzb', 'zzzd', 'zzzh', 'zzzj', 'zzzl', 'zzzz',
       'zzzzzz'],
      dtype='object', length=56375)

In [29]:
y_col_name = 'tactic_id'

In [30]:
df_concat = pd.concat([X, df[y_col_name]], axis=1).explode([y_col_name]).reset_index(drop = True)
df_concat

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaaaaaaaaaaaaaaaa,aaaab,aaab,aaac,aaad,...,zzyx,zzz,zzzb,zzzd,zzzh,zzzj,zzzl,zzzz,zzzzzz,tactic_id
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0003
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0004
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0005
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0040
4,0.008879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0005
4894,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0006
4895,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0009
4896,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TA0011


In [31]:
df_agg = df_concat.groupby([y_col_name]).mean().reset_index()
y_col = df_agg[y_col_name].unique()[0] # get first tech id

In [32]:
df_agg.loc[df_agg[y_col_name] == y_col_name].iloc[:,1:].transpose().sort_values(by = 0, ascending=False)

KeyError: 0

# Visualisation

In [33]:
# Visualise frequence of words:

def plot_freq_cats(df, y_col_name, top_n=15):
    """Plot the frequent words used in the different categories

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to get the word frequency
    top_n : int, default = 15
        number of most frequent words to show
    """
    nb_cats = df[y_col_name].nunique() # number of tech_names
    fig = make_subplots(rows=nb_cats, cols=1, vertical_spacing=0.02,
                       subplot_titles=["Word frequency for "+ cat for cat in df[y_col_name].unique()])
    
    for i, cat in enumerate(df[y_col_name].unique()):
        df_freq = (df.loc[df[y_col_name] == cat]
                   .iloc[:,1:].transpose().reset_index())
                  
        df_freq.columns = ['words', 'frequency']
        df_freq = (df_freq.sort_values(by = 'frequency', ascending=False)) 

                  
        fig_site = px.bar(df_freq.iloc[:top_n,:],
                      x="words", y="frequency", orientation="v",
                      width=800, height=300, title=f"Word frequency for {cat} ")
        fig.append_trace(fig_site["data"][0], row=i+1, col=1)
    
    fig.update_layout(
    autosize=False,
    width=800,
    height=4400,)
    fig.show()


In [34]:
plot_freq_cats(df_agg.iloc[:10, :], y_col_name)

In [38]:
# Plot the number of articles per technique: 

Y.sum(axis=0).sort_values(ascending=False)#.plot()
"""
TA0005    768
TA0003    590
TA0002    488
TA0011    406
TA0004    403
TA0007    356
TA0008    343
TA0006    306
TA0009    236
TA0001    202
TA0010    123
TA0040     75
dtype: int64
"""

TA0005    822
TA0003    561
TA0004    558
TA0002    499
TA0011    491
TA0007    486
TA0009    343
TA0006    322
TA0040    154
TA0010    141
TA0001    135
TA0008    118
TA0042     55
TA0108     36
TA0043     32
TA0105     28
TA0100     22
TA0107     20
TA0104     12
TA0103     10
TA0102      9
TA0109      9
TA0110      7
TA0106      6
TA0101      5
TA0111      1
dtype: int64

In [39]:
Y1 = Y.loc[:,Y.sum(axis=0)>=80] # only techniques with more than 25 reports
Y = Y1[Y1.sum(axis=1)>0] 
X = X[Y1.sum(axis=1)>0] # all urls who map at least one of the techniques in Y1

In [40]:
Y1[Y1.sum(axis=1) > 0]

Unnamed: 0,TA0001,TA0002,TA0003,TA0004,TA0005,TA0006,TA0007,TA0008,TA0009,TA0010,TA0011,TA0040
0,0,0,1,1,1,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,0,1
2,0,1,0,1,1,0,1,0,1,0,1,1
3,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1515,1,1,1,1,1,1,1,1,1,0,1,0
1516,0,1,1,1,1,0,0,0,0,0,1,0
1517,0,0,0,0,0,0,0,0,0,0,1,0
1518,0,1,1,0,1,0,1,0,1,0,1,0


# Model: 

In [43]:

# Train and test: First delete techniques less than 9 

# We fix the random state to have the same dataset in our different tests

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state = 10)

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced'), n_jobs = 1)
sv_classifier.fit(x_train, y_train)



OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False),
                    n_jobs=1)

In [55]:
y_pred = pd.DataFrame(sv_classifier.predict(x_train), columns=y_train.columns)

In [50]:
f_score_dict = {}
for col in y_test.columns:
    print(col)
    f_score_dict[col] = fbeta_score(y_train[col], y_pred[col],beta=0.5)
    print(classification_report(y_train[col], y_pred[col]))

TA0001
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       841
           1       0.89      1.00      0.94       104

    accuracy                           0.99       945
   macro avg       0.94      0.99      0.97       945
weighted avg       0.99      0.99      0.99       945

TA0002
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       610
           1       0.94      1.00      0.97       335

    accuracy                           0.98       945
   macro avg       0.97      0.98      0.98       945
weighted avg       0.98      0.98      0.98       945

TA0003
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       559
           1       0.96      0.99      0.97       386

    accuracy                           0.98       945
   macro avg       0.98      0.98      0.98       945
weighted avg       0.98      0.98      0.98       945

In [51]:
y_pred = pd.DataFrame(sv_classifier.predict(x_test), columns=y_test.columns)

In [52]:
f_score_dict = {}
for col in y_test.columns:
    print(col)
    # f beta score: weighted harmonic mean of precision and recall (similar to legoy)
    f_score_dict[col] = fbeta_score(y_test[col], y_pred[col],beta=0.5)
    print(classification_report(y_test[col], y_pred[col]))

TA0001
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       374
           1       0.37      0.35      0.36        31

    accuracy                           0.90       405
   macro avg       0.66      0.65      0.65       405
weighted avg       0.90      0.90      0.90       405

TA0002
              precision    recall  f1-score   support

           0       0.76      0.77      0.77       241
           1       0.66      0.64      0.65       164

    accuracy                           0.72       405
   macro avg       0.71      0.71      0.71       405
weighted avg       0.72      0.72      0.72       405

TA0003
              precision    recall  f1-score   support

           0       0.70      0.71      0.70       230
           1       0.61      0.59      0.60       175

    accuracy                           0.66       405
   macro avg       0.65      0.65      0.65       405
weighted avg       0.66      0.66      0.66       405

In [53]:
f_score_dict

{'TA0001': 0.36423841059602646,
 'TA0002': 0.6529850746268657,
 'TA0003': 0.6053550640279394,
 'TA0004': 0.6473988439306358,
 'TA0005': 0.78515625,
 'TA0006': 0.571808510638298,
 'TA0007': 0.6909547738693468,
 'TA0008': 0.3125,
 'TA0009': 0.521415270018622,
 'TA0010': 0.2127659574468085,
 'TA0011': 0.7176749703440094,
 'TA0040': 0.6349206349206349}

In [54]:
np.mean(list(f_score_dict.values()))

0.5597644800349322

In [44]:
label_index = list(y_test.columns).index('T1059')
s = pd.Series(naive_bayes_classifier.estimators_[label_index].feature_log_prob_[1], index=x_test.columns)
s.sort_values(ascending=False)[:20]

malware_name    -8.926368
malware         -9.684824
figure          -9.798373
file            -9.823947
exe             -9.970309
security       -10.024613
threat         -10.102563
data           -10.145757
used           -10.174743
dll            -10.186079
com            -10.187316
backdoor       -10.196884
server         -10.283662
command        -10.289561
apt            -10.349288
files          -10.420309
powershell     -10.424656
windows        -10.438944
malicious      -10.453184
using          -10.477197
dtype: float64

In [45]:
label_index = list(y_test.columns).index('T1059')
s = pd.Series(naive_bayes_classifier.estimators_[label_index].feature_log_prob_[0], index=x_test.columns)
s.sort_values(ascending=False)[:20]

malware_name   -8.580369
malware        -9.071352
file           -9.104083
security       -9.200999
exe            -9.394428
windows        -9.481843
com            -9.569012
figure         -9.631102
threat         -9.632430
data           -9.644766
used           -9.660471
dll            -9.686135
process        -9.721100
ransomware     -9.747026
files          -9.785526
user           -9.808118
using          -9.836171
code           -9.847254
new            -9.861232
mandiant       -9.881338
dtype: float64

In [46]:
# Most frequent words in urls per technique:

for label_index, col in enumerate(y_test.columns):
    print(col)
    print()
    s1 = pd.Series(naive_bayes_classifier.estimators_[label_index].feature_log_prob_[1], index=x_test.columns)
    s2 = pd.Series(naive_bayes_classifier.estimators_[label_index].feature_log_prob_[0], index=x_test.columns)

    print((s1 - s2).sort_values(ascending=False)[:50])
    print()

T1001

ketrican             0.587950
redbaldknight        0.515153
scarlet              0.443699
quarian              0.420774
brothers             0.400952
crossbow             0.396235
backdoordiplomacy    0.377603
shrouded             0.375332
sdvro                0.314626
taskframe            0.280837
xxmm                 0.263228
abbob                0.259044
jri                  0.254791
iji                  0.254504
avsvmcloud           0.249427
crl                  0.245384
businesslayer        0.243204
dicemention          0.238153
micrnet              0.238153
quant                0.229874
grim                 0.223171
contentlist          0.205836
kjdhc                0.202042
sarmsoftware         0.199841
pcrat                0.195279
cloudns              0.192761
coordinator          0.188834
bing                 0.184613
blocklisted          0.181057
docto                0.177154
woff                 0.177133
appsync              0.177078
chang                0.176114
ser

T1010

cadelle                 0.518597
aria                    0.453735
crossbow                0.398073
shrouded                0.377170
pact                    0.319252
sharper                 0.318685
mdl                     0.294181
creature                0.245204
cryptui                 0.234371
fantasy                 0.224505
teamatt                 0.218659
xsfunction              0.212188
evasiondescription      0.210461
rentbikespb             0.206639
mediafire               0.201456
txnid                   0.193496
afead                   0.193496
activemime              0.179551
rrrbiebl                0.178713
foreground              0.169433
drigo                   0.169430
bifrose                 0.169430
discoverygrandoreiro    0.168375
txtlist                 0.168375
discoverydescription    0.167628
initech                 0.166710
yvsvuyps                0.163698
iib                     0.161583
googlecrash             0.157246
winde                   0.148868
adm

leonardo              0.607209
ketrican              0.582017
shathak               0.513717
windigo               0.473067
threedollars          0.441738
scarlet               0.437766
kgh                   0.437507
posadadesantiago      0.435584
snakemackerel         0.420019
xmlprov               0.405933
kt                    0.398927
brothers              0.395019
managedplugin         0.380741
pluginhost            0.356626
nhs                   0.349175
irdsnhrxxxfery        0.347761
kimsuky               0.346002
sharpshooterreport    0.343576
vivisection           0.320225
sharper               0.310913
sdvro                 0.308692
duuzer                0.308089
dorusio               0.304177
mdl                   0.286410
kupay                 0.284901
cybergang             0.280297
taskframe             0.274904
aswrundll             0.267580
coingotrade           0.258608
subdir                0.257869
isfb                  0.247676
dotm                  0.242628
darkhalo

zscaler             0.620597
cofense             0.582585
ketrican            0.576564
venezuelan          0.564877
dnc                 0.561548
krcert              0.552134
keylogger           0.523183
cadelle             0.505373
redbaldknight       0.503767
errorrefresh        0.485473
interruptcss        0.485473
groundbait          0.446890
kgh                 0.436376
kisa                0.435073
scarlet             0.432313
posadadesantiago    0.430131
dcx                 0.405448
opencv              0.400196
crossbow            0.384849
stspy               0.383118
blackhat            0.382494
monsoon             0.365686
shrouded            0.363946
setwindowshookex    0.354905
td                  0.353516
radware             0.352438
keystrokes          0.352016
loginprompt         0.350794
ehdevel             0.344287
openal              0.331293
keyloggers          0.325224
smethod             0.323221
eisl                0.319423
dispatcher          0.318215
hook          

str                       0.736764
growlhelper               0.678664
zscaler                   0.640056
gandcrab                  0.634208
orangeworm                0.618895
leonardo                  0.595386
singularitysingularity    0.578348
implant                   0.570825
ketrican                  0.560386
aria                      0.544420
endobj                    0.514531
drbcontrol                0.502748
darkhotel                 0.498203
obj                       0.491811
mpaign                    0.490873
cadelle                   0.489195
scr                       0.482714
localappdata              0.460751
pnum                      0.460672
naikon                    0.459842
sfcorporation             0.458889
ofile                     0.442201
eggs                      0.440870
groundbait                0.430712
tick                      0.424347
scarcruft                 0.423273
kgh                       0.420310
pch                       0.420128
hwp                 

winnti              0.888531
botlib              0.838650
zscaler             0.640146
kt                  0.622483
leonardo            0.595476
oilrig              0.587822
locker              0.576552
ketrican            0.560476
gamaredon           0.546127
aria                0.544509
menupass            0.525529
installutil         0.509021
redbaldknight       0.498831
shathak             0.492176
autoit              0.485924
dustsquad           0.470460
interruptcss        0.469384
errorrefresh        0.469384
hwp                 0.460425
naikon              0.459932
sfcorporation       0.458978
ofile               0.442291
eggs                0.440960
advancedrun         0.427497
iron                0.425346
kgh                 0.420983
threedollars        0.420197
cactuspete          0.417386
scarlet             0.416225
posadadesantiago    0.414043
brothers            0.410990
snakemackerel       0.398478
quarian             0.393300
wwansvc             0.392982
dcx           

cadelle              0.516896
groundbait           0.458413
quarian              0.420911
backdoordiplomacy    0.377739
monsoon              0.377208
darkhotel            0.361781
pact                 0.317551
sharper              0.316983
tribe                0.300917
retro                0.292549
usbworm              0.279891
frontier             0.278744
weap                 0.258706
listdatatypes        0.246750
creature             0.243503
gallagherseanm       0.233487
quant                0.230011
isys                 0.226892
cybergang            0.226129
fantasy              0.222804
plc                  0.216963
rentbikespb          0.204938
kjdhc                0.202179
detaillevel          0.198512
ammyy                0.198216
hwi                  0.191231
smth                 0.191201
transparent          0.190477
dossier              0.189152
wiknet               0.188715
vrun                 0.183694
analogue             0.176483
audiorecorder        0.176383
ghsnls    

orangeworm                   0.647073
xhr                          0.477592
indrik                       0.384666
procedurespage               0.378818
darkhotel                    0.362258
retro                        0.293026
eventsdatehostpidactivity    0.277676
bruteforces                  0.275720
rdpscandll                   0.265110
dinj                         0.255692
pri                          0.233380
sqlagent                     0.227031
plc                          0.217928
xhrs                         0.213711
joshdev                      0.197109
wipeshadowcopies             0.196051
drbcontrol                   0.193170
discoverytrickbot            0.192165
smth                         0.191678
dossier                      0.189629
bazarloader                  0.187977
concretely                   0.184146
fileserver                   0.183409
xmlhttp                      0.182176
analogue                     0.176960
wormdll                      0.173107
fortiedr    

wnry                                  1.136762
logrhythm                             0.549082
xsoar                                 0.533871
locker                                0.449919
indrik                                0.421239
wanadecryptor                         0.419675
wwansvc                               0.418472
alqatac                               0.399058
maoloa                                0.376721
kms                                   0.366841
hermes                                0.348165
protonmail                            0.339230
btc                                   0.327994
cobralocker                           0.326009
smbv                                  0.323501
cortex                                0.313194
shadowstorage                         0.304502
tasksche                              0.302129
spider                                0.269734
teslacrypt                            0.253058
slowik                                0.248348
resize       


T1546

emond                       0.748020
growlhelper                 0.704925
amnesia                     0.533177
narrator                    0.522478
zsh                         0.478975
sigspec                     0.476913
zshrc                       0.434334
zshenv                      0.426340
dennis                      0.372177
trap                        0.366449
bcde                        0.340799
bashrc                      0.333233
ssonsvr                     0.327732
mshlpsrvc                   0.324035
eventfilter                 0.318255
objattr                     0.316650
compfun                     0.313209
hupigon                     0.299005
rabbot                      0.298248
ltmanager                   0.292127
retro                       0.291109
netwf                       0.283157
glitch                      0.279493
kaiji                       0.278570
fsevents                    0.258562
comsysapp                   0.256123
dscl                        0.

xpc                        0.604285
sucuri                     0.453938
onenote                    0.392250
scriptcontrol              0.323950
msword                     0.317846
ishaq                      0.313655
ddeauto                    0.309488
retro                      0.293924
addcode                    0.263566
mohammed                   0.262111
msoftupdates               0.261162
agelsevirine               0.244057
dde                        0.241587
cvmserver                  0.234576
pri                        0.234278
dinj                       0.231856
algorithmsdifference       0.229521
apologize                  0.220157
lulu                       0.216806
dotnettojscript            0.216107
formula                    0.215322
msexcel                    0.212719
contentlist                0.207348
inconvenience              0.205932
preemptive                 0.205430
cyberdefense               0.204767
sensepost                  0.202525
herokuapp                  0

kt                     0.425212
elfin                  0.358348
chfeeds                0.261928
msoftupdates           0.261196
zxfunction             0.259724
jri                    0.256337
iji                    0.256050
mynetwork              0.251901
cryptui                0.234079
trickloader            0.227452
nukesped               0.208827
softlayer              0.198550
herokuapp              0.195529
loveusd                0.188910
mainconnectionio       0.188910
descrip                0.188639
systeminf              0.188639
bing                   0.186159
botlib                 0.177396
adeel                  0.173362
trickload              0.171189
ncsc                   0.169396
cdab                   0.164641
symcb                  0.160667
printcache             0.158166
shellmain              0.151530
dllinj                 0.151328
salinas                0.151328
ssaveaspath            0.150143
administrador          0.146936
gvty                   0.145139
digicert

In [224]:
np.mean(list(f1_dict.values()))

0.03993704300141384

In [134]:
multilabel_confusion_matrix(y_test, y_pred).shape

(73, 2, 2)

In [40]:
# create a dataframe to understand why accuracy is so low: 

df_res = pd.DataFrame({'y_true': y_test, 'y_pred': y_pred})


In [41]:
df_res

Unnamed: 0,y_true,y_pred
1518,T0811,T1558
1246,T1614.001,T1546.004
544,T1588,T1558
1343,T1221,T1546.004
428,T1584,T1558
...,...,...
174,T1079,T1546.004
387,T1499,T1558
1561,T0848,T1558
846,T1183,T1558


In [50]:
index_usedWords = np.flatnonzero(df_dtm.iloc[0, :] > 0)

In [53]:
df_dtm.iloc[0, :][index_usedWords].index.tolist()

['11',
 '16',
 '2005',
 '2012',
 '2019',
 '2022',
 '37',
 '64',
 'ADMIN',
 'AM',
 'All',
 'Applications',
 'Attacks',
 'Aug',
 'Back',
 'Boot',
 'Broadcom',
 'Buy',
 'Chain',
 'Community',
 'Company',
 'Component',
 'Cookie',
 'Copies',
 'Copyright',
 'Creates',
 'DOMAIN',
 'Deletes',
 'Display',
 'Distributed',
 'Downloads',
 'Dropper',
 'Drops',
 'Endpoint',
 'Enter',
 'Enterprise',
 'Entries',
 'Events',
 'Executes',
 'FROM',
 'Figure',
 'Files',
 'Finally',
 'GET',
 'HTTP',
 'Higher',
 'Home',
 'Hosted',
 'How',
 'IP',
 'Image',
 'Inc',
 'Information',
 'It',
 'JPEG',
 'Johnson',
 'Keywords',
 'LLC',
 'Library',
 'Link',
 'Links',
 'Logic',
 'MBR',
 'Master',
 'Members',
 'NAME',
 'No',
 'Note',
 'Only',
 'Options',
 'Policy',
 'Powered',
 'Press',
 'Privacy',
 'Products',
 'Protection',
 'Recommend',
 'Record',
 'Register',
 'Related',
 'Reporter',
 'Reserved',
 'Resource',
 'Response',
 'Rights',
 'STATE',
 'Search',
 'Security',
 'Server',
 'Service',
 'Shamoon',
 'Shares',
 'Si

In [54]:
df_concat.iloc[0, -1]

'T1487'