In [206]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, fbeta_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import json
import os
import re

import spacy
import matplotlib
import plotly.express as px
import plotly.subplots as sp
from plotly.subplots import make_subplots
from ast import literal_eval
from tqdm import tqdm

import sklearn.metrics
import numpy as np

# Opening Files: 

In [207]:
# Relative paths of the two labelled corpora read by this notebook.
rcatt_data = "../src/rcatt_training_data_original.csv"   # rcATT training data
scraped_data = "../src/training_dataset_full.csv"        # scraped training data

In [208]:
# Load both corpora into DataFrames.

# rcATT data: keep only the first row for each distinct 'Text' value.
# The surviving rows keep their original labels, so the index may have gaps.
df_r = pd.read_csv(rcatt_data).reset_index(drop=True)
df_r = df_r.drop_duplicates(subset='Text', keep='first')

# Scraped data: rename 'text' to 'Text' so both frames use the same column name.
df_s = pd.read_csv(scraped_data)
df_s = df_s.reset_index(drop=True)
df_s = df_s.rename(columns={'text': 'Text'})

In [209]:
# Converting from string to list using literal_eval.
# Only string cells are parsed; values that are already Python objects are
# left untouched, so re-running this cell after a first conversion is a no-op
# instead of raising ValueError inside literal_eval.

for col in ['mitre_domain', 'tech_name', 'tech_id', 'tactic_id', 'software_id']:
    df_s[col] = df_s[col].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )

# Merging Files: 

In [191]:

# Binarise the per-document tactic lists: one row per scraped document and
# one 0/1 column per tactic ID found in df_s['tactic_id'].
# The resulting frame gets a default 0..n-1 row index.
mlb = MultiLabelBinarizer()
Y_s = pd.DataFrame(
    mlb.fit_transform(df_s['tactic_id']),
    columns=mlb.classes_,
)

In [192]:
# rcATT inputs: the raw text column.
X_r = df_r['Text']
# rcATT targets: every column whose name starts with 'TA', in file order.
Y_r = df_r.loc[:, df_r.columns.str.startswith('TA')]

In [193]:
# Restrict the scraped labels to the tactic columns used by the rcATT targets
# (same columns, same order). A tactic that never occurs in the scraped data
# becomes an all-zero column instead of raising KeyError.
Y1 = Y_s.reindex(columns=Y_r.columns, fill_value=0)

# Keep only the scraped documents labelled with at least one of those tactics.
Y_s = Y1[Y1.sum(axis=1) > 0]

# Pick the matching texts by row label rather than by re-applying the boolean
# mask: X_s stays aligned with Y_s, and the cell can be re-run after Y_s has
# already been filtered (the mask would then no longer align with df_s).
X_s = df_s.loc[Y_s.index, 'Text']

In [194]:
# Hold out 30% of the rcATT documents as the test set.
# The fixed random_state makes the split reproducible across runs.
X_r_train, X_test_text, Y_r_train, Y_test = train_test_split(
    X_r,
    Y_r,
    test_size=0.3,
    random_state=10,
)

In [195]:
# Training texts: the rcATT training split followed by the filtered scraped
# documents. ignore_index=True renumbers the rows 0..n-1, because the row
# labels of the two sources can overlap; this is the same numbering that
# Y_train uses.
X_train_text = pd.concat([X_r_train, X_s], ignore_index=True)

In [196]:
# Training labels: rcATT training split first, scraped documents second,
# renumbered 0..n-1.
Y_train = pd.concat([Y_r_train, Y_s], ignore_index=True)

In [197]:
# ---------------- TF-IDF ---------------------:
# Word-level TF-IDF features. The vectorizer is fitted on the training texts
# only; the test texts are transformed with that fitted vocabulary.
# min_df=2 ignores terms found in fewer than 2 training documents and
# max_df=0.99 ignores terms found in more than 99% of them.

tf_idf = TfidfVectorizer(analyzer='word', stop_words='english', lowercase=True, min_df=2, max_df=0.99)

X_train = tf_idf.fit_transform(X_train_text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available, falling back on older releases.
if hasattr(tf_idf, 'get_feature_names_out'):
    feature_names = tf_idf.get_feature_names_out()
else:
    feature_names = tf_idf.get_feature_names()

# Dense frames, one column per vocabulary term. Both frames get a default
# 0..n-1 row index, so row i corresponds to the i-th input text by position.
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names)

X_test = tf_idf.transform(X_test_text)

X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)




# Measuring Cosine Similarity to Remove Training Samples That Duplicate Test Samples:

In [198]:
# Pairwise cosine similarity of the TF-IDF rows:
# similarities[i][j] compares training row i with test row j.
similarities = cosine_similarity(X=X_train, Y=X_test)

In [199]:
# Collect the positions of training rows whose cosine similarity to at least
# one test row exceeds the threshold, so they can be dropped from training.
# The row-wise `any` replaces a Python loop over every (train, test) pair and
# yields the same set of plain-int row positions.
DUPLICATE_SIMILARITY_THRESHOLD = 0.9
is_near_duplicate = (np.asarray(similarities) > DUPLICATE_SIMILARITY_THRESHOLD).any(axis=1)
duplicates = set(np.flatnonzero(is_near_duplicate).tolist())

In [200]:
# Drop the flagged rows from both the features and the labels.
# Each frame is filtered on its own index labels; `duplicates` holds row
# positions of X_train, so this presumes both frames are numbered 0..n-1.
X_train = X_train.loc[~X_train.index.isin(duplicates)]
Y_train = Y_train.loc[~Y_train.index.isin(duplicates)]

# Model Training and Evaluation:

In [201]:
# Train a multi-label classifier: OneVsRestClassifier fits one LinearSVC per
# label column of Y_train.
# random_state is fixed so repeated fits on the same data give the same model.

sv_classifier = OneVsRestClassifier(
    LinearSVC(
        penalty='l2',
        loss='squared_hinge',
        dual=False,
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
    ),
    n_jobs=1,
)
sv_classifier.fit(X_train, Y_train)


OneVsRestClassifier(estimator=LinearSVC(class_weight='balanced', dual=False,
                                        random_state=42),
                    n_jobs=1)

In [202]:
# Predict the labels of the test rows and label the columns like Y_test.
# Y_pred gets a default 0..n-1 index, so it matches Y_test by row position,
# not by index label.
predicted_labels = sv_classifier.predict(X_test)
Y_pred = pd.DataFrame(predicted_labels, columns=Y_test.columns)

In [203]:
# Per-label evaluation: store the F-beta score (beta=0.5) of every label
# column and print its classification report.
f_score_dict = {}
for col in Y_test.columns:
    y_true, y_hat = Y_test[col], Y_pred[col]
    print(col)
    f_score_dict[col] = fbeta_score(y_true, y_hat, beta=0.5)
    print(classification_report(y_true, y_hat))

TA0006
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       352
           1       0.67      0.65      0.66        89

    accuracy                           0.87       441
   macro avg       0.79      0.79      0.79       441
weighted avg       0.86      0.87      0.87       441

TA0002
              precision    recall  f1-score   support

           0       0.86      0.89      0.88       308
           1       0.72      0.68      0.70       133

    accuracy                           0.82       441
   macro avg       0.79      0.78      0.79       441
weighted avg       0.82      0.82      0.82       441

TA0040
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       421
           1       0.59      0.65      0.62        20

    accuracy                           0.96       441
   macro avg       0.79      0.81      0.80       441
weighted avg       0.97      0.96      0.96       441

In [204]:
# Display the per-label F-beta (beta=0.5) scores gathered in the evaluation loop.
f_score_dict

{'TA0006': 0.6697459584295612,
 'TA0002': 0.7109004739336492,
 'TA0040': 0.6018518518518519,
 'TA0003': 0.7634543178973716,
 'TA0004': 0.5979202772963604,
 'TA0008': 0.6109979633401221,
 'TA0005': 0.7934131736526946,
 'TA0010': 0.4430379746835443,
 'TA0007': 0.7752808988764045,
 'TA0009': 0.6528189910979229,
 'TA0011': 0.7329842931937174,
 'TA0001': 0.5734767025089605}

In [205]:
# Unweighted mean of the per-label scores stored in f_score_dict.
np.fromiter(f_score_dict.values(), dtype=float).mean()

0.6604902397301801