In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN

import pandas as pd
import json
import os
import re

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

In [2]:
# Relative path to the CSV file holding the full training dataset.
ics_data = "../src/training_dataset_full.csv"

In [3]:
# Read the CSV at `ics_data` into a DataFrame, then replace its index with a
# fresh 0..n-1 range (the old index is discarded, not kept as a column).

df = pd.read_csv(ics_data)
df = df.reset_index(drop=True)

In [4]:
# The columns below hold string representations of Python literals; parse
# every cell back into the corresponding Python object with literal_eval.

serialized_columns = ('mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id')
for col in serialized_columns:
    df[col] = df[col].apply(literal_eval)

In [7]:
# Helper used to pick the ICS attacks out of the dataset.
def has_ics(domain):
    """Return True if `domain` contains 'ics-attack' or 'ICS'.

    `domain` can be any object supporting the `in` operator; membership is
    tested with `in`, so for a list this is element equality and for a string
    it is a substring check.
    """
    ics_markers = ('ics-attack', 'ICS')
    return any(marker in domain for marker in ics_markers)

# Keep only the rows whose 'mitre_domain' value is flagged by has_ics.
is_ics_row = df['mitre_domain'].apply(has_ics)
df_ics = df[is_ics_row]

In [8]:

# Binarize the per-row 'tactic_id' collections into a 0/1 indicator matrix
# with one column per distinct tactic ID (column names come from the fitted
# binarizer's classes_).
mlb = MultiLabelBinarizer()
Y_s = pd.DataFrame(mlb.fit_transform(df_ics['tactic_id']), columns=mlb.classes_)

In [12]:
# Keep only the label columns whose name starts with 'TA01', then display
# the number of positive samples per remaining label.
ta01_columns = [label for label in Y_s.columns if label.startswith('TA01')]
Y_s = Y_s[ta01_columns]
Y_s.sum(axis=0)

TA0100    22
TA0101     5
TA0102     9
TA0103    10
TA0104    12
TA0105    28
TA0106     6
TA0107    20
TA0108    36
TA0109     9
TA0110     7
TA0111     1
dtype: int64

In [14]:
# Display the rows that carry the TA0111 label.
Y_s.loc[Y_s['TA0111'] == 1]

Unnamed: 0,TA0100,TA0101,TA0102,TA0103,TA0104,TA0105,TA0106,TA0107,TA0108,TA0109,TA0110,TA0111
82,1,1,1,1,1,1,1,1,1,1,1,1


In [21]:
# Remove the TA0111 label: the per-label counts show it has a single positive
# sample (presumably too few to learn from or evaluate on).
# errors='ignore' keeps this cell idempotent -- re-running it after the column
# has already been dropped no longer raises KeyError.
Y_s = Y_s.drop(columns='TA0111', errors='ignore')

In [22]:
# Build word-level TF-IDF features from the 'text' column: lowercase input,
# drop English stop words, and ignore terms that appear in fewer than 2
# documents or in more than 99% of them.
# NOTE(review): the vectorizer is fit on all of df_ics before the train/test
# split, so vocabulary and IDF weights also see the held-out rows -- consider
# fitting on the training portion only.
tf_idf = TfidfVectorizer(analyzer = 'word', stop_words='english', lowercase=True, min_df=2, max_df=0.99)

data = tf_idf.fit_transform(df_ics['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

# Dense feature frame with one column per vocabulary term.
X = pd.DataFrame(data.toarray(), columns=feature_names)



In [25]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y_s,
    test_size=0.3,
    random_state=10,
)

# One linear SVM per label (one-vs-rest), each trained with balanced class
# weights.
base_svm = LinearSVC(
    penalty='l2',
    loss='squared_hinge',
    dual=False,
    max_iter=1000,
    class_weight='balanced',
)
sv_classifier = OneVsRestClassifier(base_svm, n_jobs=1)
sv_classifier.fit(x_train, y_train)


OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False),
                    n_jobs=1)

In [28]:
# Predict the label indicators for the held-out samples and wrap them in a
# DataFrame that mirrors y_test. Reusing y_test's index (not just its columns)
# keeps the two frames row-aligned, so pandas-level comparisons such as
# `y_test == Y_pred` match each prediction with its own ground-truth row.
Y_pred = pd.DataFrame(
    sv_classifier.predict(x_test),
    index=y_test.index,
    columns=y_test.columns,
)

In [31]:
# Macro-averaged precision over all labels. Precision is undefined for labels
# with no predicted samples; zero_division=0 scores those labels as 0
# explicitly (the same value the default uses) without emitting
# UndefinedMetricWarning.
precision_score(y_test, Y_pred, average='macro', zero_division=0)

  _warn_prf(average, modifier, msg_start, len(result))


0.1545454545454545

In [32]:
# Macro-averaged F-beta score with beta=0.5 (weights precision more heavily
# than recall).
fbeta_score(y_true=y_test, y_pred=Y_pred, beta=0.5, average='macro')

0.13840039741679086

In [33]:
# Macro-averaged recall over all labels.
recall_score(y_true=y_test, y_pred=Y_pred, average='macro')

0.10821678321678321