In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from skmultilearn.adapt import MLkNN

import pandas as pd
import json
import os
import re

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

# Opening Files: 

In [2]:
rcatt_data = '../src/rcatt_training_data_original.csv'
scraped_data = '../src/training_dataset_full.csv'

In [3]:
# transform into dataframes: 

df_r = pd.read_csv(rcatt_data).reset_index(drop = True)
df_r = df_r[~df_r['Text'].duplicated()]
df_s = pd.read_csv(scraped_data).reset_index(drop = True).rename(columns={'text': 'Text'})

In [4]:
# Converting from string to list using literal_eval:

for col in ['mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id']:
    df_s[col] = df_s[col].apply(literal_eval)

# Merging Files: 

In [5]:

mlb = MultiLabelBinarizer()
Y_s = mlb.fit_transform(df_s['tactic_id'])
Y_s = pd.DataFrame(Y_s, columns=mlb.classes_)

In [6]:
X_r = df_r['Text']
Y_r = df_r[[col for col in df_r.columns if col.startswith('TA')]]

In [7]:
Y1 = Y_s[Y_r.columns]
Y_s = Y1[Y1.sum(axis=1)>0] 
X_s = df_s['Text']
X_s = X_s[Y1.sum(axis=1)>0] # all urls who map at least one of the techniques in Y1

In [8]:
X_r_train, X_test_text, Y_r_train, Y_test = train_test_split(X_r, Y_r, test_size=0.3,
                                                    random_state = 10)

In [9]:
X_train_text = pd.concat([X_r_train, X_s])

In [10]:
Y_train = pd.concat([Y_r_train, Y_s]).reset_index(drop=True)

# Feature Extraction:

In [14]:
# ------------ Count Vectorizer --------------- 

# cv = CountVectorizer(analyzer ='word', stop_words ='english', lowercase = True, min_df = 2, max_df = 0.99) # if words used less than 0.001 % --> ignore  

# X_train = cv.fit_transform(X_train_text)

# X_train = pd.DataFrame(X_train.toarray(), columns = cv.get_feature_names()) 

# X_test = cv.transform(X_test_text)

# X_test = pd.DataFrame(X_test.toarray(), columns = cv.get_feature_names())                 

# ---------------- TF-IDF ---------------------: 


tf_idf = TfidfVectorizer(analyzer = 'word', stop_words='english', lowercase=True, min_df =2, max_df=0.99)

X_train = tf_idf.fit_transform(X_train_text)

X_train = pd.DataFrame(X_train.toarray(), columns=tf_idf.get_feature_names()) 

X_test = tf_idf.transform(X_test_text)

X_test = pd.DataFrame(X_test.toarray(), columns=tf_idf.get_feature_names()) 

# ---------------- Word2Vec LSTM ---------------------: 


In [24]:
X_train.shape

(2376, 74675)

In [25]:
Y_train.shape

(2376, 12)

# Measuring Cosine Similarity to Remove Duplicates: 

In [15]:
similarities = cosine_similarity(X_train, X_test)

In [16]:
duplicates = set()
for i in range(similarities.shape[0]):
    for j in range(similarities.shape[1]):
        if similarities[i][j] > 0.9:
            # print(i, j, similarities[i][j])
            duplicates.add(i)

In [17]:
X_train = X_train[~X_train.index.isin(duplicates)]
Y_train = Y_train[~Y_train.index.isin(duplicates)]

# Naive Bayes:

In [29]:
naive_bayes_classifier = OneVsRestClassifier(MultinomialNB())
naive_bayes_classifier.fit(X_train, Y_train)

OneVsRestClassifier(estimator=MultinomialNB())

In [30]:
y_pred_proba = pd.DataFrame(naive_bayes_classifier.predict_proba(X_test), columns = Y_test.columns)

In [31]:
y_pred = (y_pred_proba > 0.005).astype(int) # if increase threshold, recall decreases and precision (could) increase

In [32]:
precision_score(Y_test, y_pred, average ='macro')

0.6668051002292635

In [33]:
fbeta_score(Y_test, y_pred, beta=0.5, average ='macro')

0.3826809715685888

In [34]:
recall_score(Y_test, y_pred, average='macro')

0.30668841162625965

# SVM:

In [24]:
# Train and test: First delete techniques less than 9 
# We fix the random state to have the same dataset in our different tests

sv_classifier = OneVsRestClassifier(LinearSVC(penalty = 'l2', loss = 'squared_hinge', dual = False, max_iter = 1000, class_weight = 'balanced', random_state=42), n_jobs = 1)
sv_classifier.fit(X_train, Y_train)


KeyboardInterrupt: 

In [50]:
Y_pred = pd.DataFrame(sv_classifier.predict(X_test), columns=Y_test.columns)

In [51]:
precision_score(Y_test, y_pred, average ='macro')

0.6668051002292635

In [52]:
fbeta_score(Y_test, y_pred, beta=0.5, average ='macro')

0.3826809715685888

In [49]:
recall_score(Y_test, y_pred, average='macro')

0.30668841162625965

# Multi-label KNN: 

In [18]:
knn = MLkNN(k = 3)

In [19]:
# train
knn.fit(X_train.values, Y_train.values)

# predict
predictions = knn.predict(X_test.values)



In [21]:
fbeta_score(Y_test, predictions, beta=0.5, average ='macro')

0.4749853673285184