## <font color='orange'>Setup</font>

In [None]:
# Mount Google Drive so the project folders under '/content/drive' become reachable.
from google.colab import drive
drive.mount('/content/drive')
# Work from the shared material folder; the data-loading cells below open files by relative path.
%cd '/content/drive/My Drive/Steps NLP framework/Folder_steps_material'

#### Pip installation

In [None]:
# NOTE(review): none of the installs below pin a version, so a fresh runtime may
# resolve different package versions from one run to the next — consider pinning.

# Installation for IOC defanging - https://github.com/ioc-fang/ioc-fanger
!sudo pip install -q ioc-fanger

# Installation for Pronouns and ellipsis resolution
import os
# Clone neuralcoref only when no local checkout exists yet.
if not os.path.exists('./neuralcoref'):
  !git clone https://github.com/huggingface/neuralcoref.git
# Build and install neuralcoref from inside its checkout.
%cd neuralcoref
!pip install -q -r requirements.txt
!pip install -q neuralcoref --no-binary neuralcoref
!python setup.py build_ext -q --inplace
!pip install -q -e .
!python -q -m spacy download en_core_web_lg

# Two independent instances of the large English spaCy model: pass2act_nlp stays as loaded,
# while ellipsis_nlp gets the neuralcoref component added to its pipeline below.
import en_core_web_lg
pass2act_nlp = en_core_web_lg.load()
ellipsis_nlp = en_core_web_lg.load()

import neuralcoref
neuralcoref.add_to_pipe(ellipsis_nlp)
# Leave the neuralcoref checkout again.
%cd ./..

# Installation for Synonym homogenization
!pip install -q pattern
#!pip install -q -U spacy
!pip install -q -U torch torchvision torchaudio
!pip install -q allennlp  # https://github.com/allenai/allennlp
!pip install -q allennlp_models  # https://github.com/allenai/allennlp-models
!python -q -m spacy download en_core_web_sm

# Installation for Passive/active conversion - https://github.com/DanManN/pass2act
# (en_core_web_lg is already downloaded in the neuralcoref section above.)
#!python -m spacy download en_core_web_lg

# Installation for Misspelling correction - https://github.com/bakwc/JamSpell
# NOTE(review): the doubled `!!` differs from the single `!` used on every other
# shell line in this cell, and apt-get is called without `-y` — confirm both are intended.
!!sudo apt-get install -q swig3.0
!sudo pip install -q jamspell

# Installation for Unrelated content removal
!pip install -q transformers

## <font color='orange'>Import libraries and data</font>

In [None]:
# Re-enter the shared material folder so the data files loaded below resolve by relative path.
%cd '/content/drive/My Drive/Steps NLP framework/Folder_steps_material'

### Libraries

In [None]:
import os
import re
import csv
import copy
import json
import spacy
import string
from tqdm import tqdm
from copy import deepcopy
from collections import Counter
import pattern.text.en as en
from pattern.text.en import conjugate,INFINITIVE,SG

import ioc_fanger  # https://github.com/ioc-fang/ioc-fanger

from allennlp.predictors.predictor import Predictor  # https://github.com/allenai/allennlp
import allennlp_models.tagging  # https://github.com/allenai/allennlp-models

import jamspell  # https://github.com/bakwc/JamSpell

import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd

### Data

In [None]:
# JamSpell material
if not os.path.exists('en.tar.gz'):
  !wget https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
!tar -xvf en.tar.gz

jsp = jamspell.TSpellCorrector()
assert jsp.LoadLangModel('en.bin')

In [None]:
# Select the torch device: CUDA when a GPU is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Device index in the convention used for `Predictor.from_path`: 0 = first GPU, -1 = CPU.
cuda_device = 0 if device.type == "cuda" else -1

In [None]:
# Unrelated content removal material
# Fixed token length every sentence is padded/truncated to before classification.
window_length = 128

# Folder holding both the tokenizer and the fine-tuned BERT sentence classifier.
model_input_dir = 'Sentence_classifier_BERT'
tokenizer = BertTokenizer.from_pretrained(model_input_dir)
# `.to(device)` returns the module itself, so loading and moving are chained.
model = BertForSequenceClassification.from_pretrained(model_input_dir).to(device)

In [None]:
# AllenNLP material
# Load the structured-prediction SRL BERT archive directly from its public URL.
# `cuda_device` is the device index computed earlier in this notebook (-1 means CPU).
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz",
                                cuda_device=cuda_device)

In [None]:
# Stopwords material
# NLTK's English stop-word list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Internet slang words material
# Merge both SlangNet dictionaries (one entry per line) into a single sorted,
# de-duplicated list of lower-cased, whitespace-stripped entries.
internet_slang_words = set()
for dictionary_path in ('dictionary_slang_socialmedia_slangnet.txt',
                        'dictionary_slang_webforums_slangnet.txt'):
    with open(dictionary_path, "r", encoding='utf-8') as file:
        internet_slang_words.update(line.strip().lower() for line in file)
internet_slang_words = sorted(internet_slang_words)

In [None]:
# Aliases material
def read_alias_from_csv(filename):
  """Load an alias table from a CSV file.

  Every non-empty CSV row becomes one list of strings. The first column of
  each row is upper-cased; the remaining columns are returned as read.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A list of rows (each a list of strings), in file order. Blank lines in
    the file are skipped.
  """
  # utf-8 is stated explicitly, like the other text files opened in this notebook,
  # so decoding does not depend on the platform default.
  with open(filename, "r", newline="", encoding="utf-8") as file:
    # csv.reader yields [] for a blank line; dropping those keeps row[0] valid below.
    rows = [row for row in csv.reader(file) if row]

  for row in rows:
    row[0] = row[0].upper()
  return rows

# Alias tables for threat-actor groups and malware families, read from CSV files
# in the current working directory (first column of every row is upper-cased).
actor_groups_aliases = read_alias_from_csv('threat_actors_aliases.csv')
malware_families_aliases = read_alias_from_csv('threat_names_aliases.csv')

### Datasets

In [None]:
# Switch the working directory to the datasets folder.
%cd '/content/drive/My Drive/Steps NLP framework/Pipeline_new_data/Folder_datasets'

## <font color='orange'>Define lists and functions</font>

### Lists

#### Passive/Active Conversion

In [None]:
# Pronoun swap table: each lower-case pronoun maps to its counterpart in the
# other grammatical case (subject <-> object); 'you' maps to itself.
pass2act_noundict = {
    'i': 'me',
    'we': 'us',
    'you': 'you',
    'he': 'him',
    'she': 'her',
    'they': 'them',
    'them': 'they',
    'her': 'she',
    'him': 'he',
    'us': 'we',
    'me': 'i',
}

#### Synonym Homogenization

##### Noun lists

In [None]:
# Spelling variants of the APPDATA placeholder (bare, %...% and <...> forms, with and
# without inner spaces). The first entry is the bare upper-case token.
# NOTE(review): presumably every variant is rewritten to the first entry — confirm with the consumer.
# Some variants appear twice; they are kept as-is.
appdata_list = ['APPDATA' , 'APPDATA_', "%APPDATA%", "%AppData%", "%appdata%", "% APPDATA%", "% AppData%", "% appdata%",
                "<APPDATA>", "<AppData>", "<appdata>", "% APPDATA %", "% AppData %", "% appdata %", "% APPDATA %", "% AppData %", "% appdata %",
                "< APPDATA >", "%Appdata%", "< AppData >", "< appdata >"]

# Spelling variants of the COMMON_APPDATA placeholder (bare, %...% and <...> forms,
# with and without inner spaces). The first entry is the bare upper-case token.
# Every entry is followed by a comma: two commas were missing, which made Python
# concatenate adjacent literals into entries such as "%Common_Appdata%<COMMON_APPDATA>".
common_appdata_list = ['COMMON_APPDATA' ,"%COMMON_APPDATA%", "%common_appdata%", "%Common_Appdata%", "%COMMON_APPDATA%",
                "%common_appdata%", "%Common_Appdata%", "<COMMON_APPDATA>", "<common_appdata>", "<Common_Appdata>",
                "% COMMON_APPDATA %", "% common_appdata %", "% Common_Appdata %", "% COMMON_APPDATA %",
                "% common_appdata %", "% Common_Appdata %", "< COMMON_APPDATA >", "< common_appdata >",
                "< Common_Appdata >"]

# Spelling variants of Windows path placeholders, one list per placeholder.
# Each list starts with the bare upper-case token, followed by %...% and <...>
# spellings in several letter cases, with and without inner spaces.
# NOTE(review): presumably every variant is rewritten to the first entry of its list —
# confirm with the consumer. Repeated entries are kept exactly as they are.

# PROFILENAME
profilename_list = ['PROFILENAME' ,"PROFILENAME_","%ProfileName%", "%PROFILENAME%", "%Profilename%", "% ProfileName%", "% PROFILENAME%",
                "% Profilename%", "<ProfileName>", "<PROFILENAME>", "<Profilename>",
                "% ProfileName %", "% PROFILENAME %", "% Profilename %", "% ProfileName %", "% PROFILENAME %",
                "% Profilename %", "< ProfileName >", "< PROFILENAME >", "< Profilename >",
                "%profilename%", "% profilename%","% profilename %", "<profilename>", "< profilename>","< profilename >"]

# USERNAME
username_list = ['USERNAME' ,"USERNAME_","%UserName%", "%USERNAME%", "%username%", "% UserName%", "% USERNAME%", "% username%",
                "<UserName>", "<USERNAME>", "<username>", "<Username>",
                "% UserName %", "% USERNAME %", "% username %", "% UserName %", "% USERNAME %", "% username %",
                "< UserName >", "< USERNAME >", "< username >", "< Username >"]

# TEMP (also covers the spelled-out "temporary folder" forms)
temp_list = ['TEMP' ,"TEMP_", "%temp%", "%TEMP%", "%Temp%", "% temp%", "% TEMP%",
                "% Temp%", "<temp>", "<TEMP>", "<Temp>",
                "% temp %", "% TEMP %", "% Temp %", "% temp %", "% TEMP %",
                "% Temp %", "< temp >", "< TEMP >", "< Temp >", "<Windows temporary folder>","<temporary folder>" ,"%temporary folder%" ]

# USERPROFILE
userprofile_list = ['USERPROFILE' , "USERPROFILE_", "%UserProfile%", "%UserProfile%", "%UserProfile%", "%USERPROFILE%","%Userprofile%",
                "%userprofile%", "<UserProfile>", "<USERPROFILE>", "<userprofile>","<Userprofile>",
                "% UserProfile%", "% UserProfile%", "% UserProfile%", "% USERPROFILE%","< Userprofile>",
                "% userprofile%", "< UserProfile>", "< USERPROFILE>", "< userprofile>","< Userprofile >",
                "% UserProfile %", "% UserProfile %", "% UserProfile %", "% USERPROFILE %","% Userprofile%",
                "% userprofile %", "< UserProfile >", "< USERPROFILE >", "< userprofile >","% Userprofile %",
                "% UserProfile %"]

# SYSTEMROOT
systemroot_list = ['SYSTEMROOT' ,"SYSTEMROOT_", "%SystemRoot%", "% SystemRoot %", "% SystemRoot%","%SYSTEMROOT%", "% SYSTEMROOT %", "% SYSTEMROOT%",
                "%systemroot%", "% systemroot %", "% systemroot%", "%Systemroot%", "% Systemroot %", "% Systemroot%",
                "<SystemRoot>", "< SystemRoot >", "< SystemRoot>", "<SYSTEMROOT>", "< SYSTEMROOT >","< SYSTEMROOT>",
                "<systemroot>", "< systemroot >", "< systemroot>", "<Systemroot>", "< Systemroot >","< Systemroot>"]

# WINDOWS
windows_list = ['WINDOWS' , "WINDOWS_","%WINDOWS%", "% WINDOWS %", "% WINDOWS%", "%Windows%", "% Windows %", "% Windows%","%windows%", "% windows %", "% windows%",
                "<WINDOWS>", "< WINDOWS >", "< WINDOWS>", "<Windows>", "< Windows >", "< Windows>","<windows>", "< windows >", "< windows>"]

# WINDIR
windir_list = ['WINDIR' ,"WINDIR_","%WINDIR%", "% WINDIR %", "% WINDIR%", "%Windir%", "% Windir %", "% Windir%", "%windir%","% windir %", "% windir%",
                "<WINDIR>", "< WINDIR >", "< WINDIR>", "<Windir>", "< Windir >", "< Windir>", "<windir>","< windir >", "< windir>" ]

# DEFAULTUSERPROFILE
# NOTE(review): unlike its siblings this name has no `_list` suffix; it is left unchanged
# because code outside this cell may refer to it.
defaultuserprofile =['DEFAULTUSERPROFILE' ,"DEFAULTUSERPROFILE_", "%DEFAULTUSERPROFILE%", "% DEFAULTUSERPROFILE %", "% DEFAULTUSERPROFILE%",
                "%defaultuserprofile%", "% defaultuserprofile %", "% defaultuserprofile%",
                "%Defaultuserprofile%", "% Defaultuserprofile %", "% Defaultuserprofile%",
                "%DefaultUserProfile%", "% DefaultUserProfile %", "% DefaultUserProfile%",
                "<DEFAULTUSERPROFILE>", "< DEFAULTUSERPROFILE >", "< DEFAULTUSERPROFILE>",
                "<defaultuserprofile>", "< defaultuserprofile >", "< defaultuserprofile>",
                "<Defaultuserprofile>", "< Defaultuserprofile >", "< Defaultuserprofile>",
                "<DefaultUserProfile>", "< DefaultUserProfile >", "< DefaultUserProfile>"]

# HOMEPATH
homepath_list = ['HOMEPATH' ,"HOMEPATH_","%HOMEPATH%", "% HOMEPATH %", "% HOMEPATH%",
                "%HomePath%", "% HomePath %", "% HomePath%",
                "%homepath%", "% homepath %", "% homepath%",
                "%Homepath%", "% Homepath %", "% Homepath%",
                "<HOMEPATH>", "< HOMEPATH >", "< HOMEPATH>",
                "<HomePath>", "< HomePath >", "< HomePath>",
                "<homepath>", "< homepath >", "< homepath>",
                "<Homepath>", "< Homepath >", "< Homepath>"]

# HOMEFOLDER
homefolder_list = ['HOMEFOLDER' ,"HOMEFOLDER_" ,"HOMEFOLDER", "%HOMEFOLDER%", "% HOMEFOLDER %", "% HOMEFOLDER%",
                "%HomeFolder%", "% HomeFolder %", "% HomeFolder%",
                "%homefolder%", "% homefolder %", "% homefolder%",
                "%Homefolder%", "% Homefolder %", "% Homefolder%",
                "<HOMEFOLDER>", "< HOMEFOLDER >", "< HOMEFOLDER>",
                "<HomeFolder>", "< HomeFolder >", "< HomeFolder>",
                "<homefolder>", "< homefolder >", "< homefolder>",
                "<Homefolder>", "< Homefolder >", "< Homefolder>"]

# PROGRAMFILES (plural)
programfiles_list = ['PROGRAMFILES' ,"PROGRAMFILES_", "%PROGRAMFILES%", "% PROGRAMFILES %", "% PROGRAMFILES%",
                "%ProgramFiles%", "% ProgramFiles %", "% ProgramFiles%",
                "%programfiles%", "% programfiles %", "% programfiles%",
                "%Programfiles%", "% Programfiles %", "% Programfiles%",
                "<PROGRAMFILES>", "< PROGRAMFILES >", "< PROGRAMFILES>",
                "<ProgramFiles>", "< ProgramFiles >", "< ProgramFiles>",
                "<programfiles>", "< programfiles >", "< programfiles>",
                "<Programfiles>", "< Programfiles >", "< Programfiles>"]

# PROGRAMFILE (singular)
programfile_list = ['PROGRAMFILE' ,"PROGRAMFILE_", "%PROGRAMFILE%", "% PROGRAMFILE %", "% PROGRAMFILE%",
                "%ProgramFile%", "% ProgramFile %", "% ProgramFile%",
                "%programfile%", "% programfile %", "% programfile%",
                "%Programfile%", "% Programfile %", "% Programfile%",
                "<PROGRAMFILE>", "< PROGRAMFILE >", "< PROGRAMFILE>",
                "<ProgramFile>", "< ProgramFile >", "< ProgramFile>",
                "<programfile>", "< programfile >", "< programfile>",
                "<Programfile>", "< Programfile >", "< Programfile>"]

# SYSTEMFOLDER (one-word and two-word "system folder" spellings)
systemfolder_list = ['SYSTEMFOLDER' ,"SYSTEMFOLDER_","%SYSTEMFOLDER%", "% SYSTEMFOLDER %", "% SYSTEMFOLDER%",
                "%SystemFolder%", "% SystemFolder %", "% SystemFolder%",
                "%systemfolder%", "% systemfolder %", "% systemfolder%",
                "%Systemfolder%", "% Systemfolder %", "% Systemfolder%",
                "<SYSTEMFOLDER>", "< SYSTEMFOLDER >", "< SYSTEMFOLDER>",
                "<SystemFolder>", "< SystemFolder >", "< SystemFolder>",
                "<systemfolder>", "< systemfolder >", "< systemfolder>",
                "<Systemfolder>", "< Systemfolder >", "< Systemfolder>",
                "%SYSTEM FOLDER%", "% SYSTEM FOLDER %", "% SYSTEM FOLDER%",
                "%System Folder%", "% System Folder %", "% System Folder%",
                "%system folder%", "% system folder %", "% system folder%",
                "%System folder%", "% System folder %", "% System folder%",
                "<SYSTEM FOLDER>", "< SYSTEM FOLDER >", "< SYSTEM FOLDER>",
                "<System Folder>", "< System Folder >", "< System Folder>",
                "<system folder>", "< system folder >", "< system folder>",
                "<System folder>", "< System folder >", "< System folder>"]

# Spelling variants of the SYSTEMDRIVES / SYSTEMDRIVE placeholder: plural and singular,
# one-word and two-word forms, %...% and <...> delimiters, several letter cases.
# The first entry is the bare upper-case token.
# Every entry is followed by a comma: the one after "< System drives>" was missing,
# which fused it with "%SYSTEMDRIVE%" into a single bogus entry.
systemdrives_list = ['SYSTEMDRIVES' ,"SYSTEMDRIVEs_", "%SYSTEMDRIVEs%", "% SYSTEMDRIVEs %", "% SYSTEMDRIVEs%",
                "%SystemDrives%", "% SystemDrives %", "% SystemDrives%",
                "%systemdrives%", "% systemdrives %", "% systemdrives%",
                "%Systemdrives%", "% Systemdrives %", "% Systemdrives%",
                "<SYSTEMDRIVEs>", "< SYSTEMDRIVEs >", "< SYSTEMDRIVEs>",
                "<SystemDrives>", "< SystemDrives >", "< SystemDrives>",
                "<systemdrives>", "< systemdrives >", "< systemdrives>",
                "<Systemdrives>", "< Systemdrives >", "< Systemdrives>",
                "%SYSTEM DRIVEs%", "% SYSTEM DRIVEs %", "% SYSTEM DRIVEs%",
                "%System Drives%", "% System Drives %", "% System Drives%",
                "%system drives%", "% system drives %", "% system drives%",
                "%System drives%", "% System drives %", "% System drives%",
                "<SYSTEM DRIVEs>", "< SYSTEM DRIVEs >", "< SYSTEM DRIVEs>",
                "<System Drives>", "< System Drives >", "< System Drives>",
                "<system drives>", "< system drives >", "< system drives>",
                "<System drives>", "< System drives >", "< System drives>",
                "%SYSTEMDRIVE%", "% SYSTEMDRIVE %", "% SYSTEMDRIVE%",
                "%SystemDRIVE%", "% SystemDRIVE %", "% SystemDRIVE%",
                "%systemDRIVE%", "% systemDRIVE %", "% systemDRIVE%",
                "%SystemDRIVE%", "% SystemDRIVE %", "% SystemDRIVE%",
                "<SYSTEMDRIVE>", "< SYSTEMDRIVE >", "< SYSTEMDRIVE>",
                "<SystemDRIVE>", "< SystemDRIVE >", "< SystemDRIVE>",
                "<systemDRIVE>", "< systemDRIVE >", "< systemDRIVE>",
                "<SystemDRIVE>", "< SystemDRIVE >", "< SystemDRIVE>",
                "%SYSTEM DRIVE%", "% SYSTEM DRIVE %", "% SYSTEM DRIVE%",
                "%System DRIVE%", "% System DRIVE %", "% System DRIVE%",
                "%system DRIVE%", "% system DRIVE %", "% system DRIVE%",
                "%System DRIVE%", "% System DRIVE %", "% System DRIVE%",
                "<SYSTEM DRIVE>", "< SYSTEM DRIVE >", "< SYSTEM DRIVE>",
                "<System DRIVE>", "< System DRIVE >", "< System DRIVE>",
                "<system DRIVE>", "< system DRIVE >", "< system DRIVE>"]


# Spelling variants of further placeholders, one list per placeholder. Each list starts
# with the bare upper-case token, followed by %...% and <...> spellings in several
# letter cases, with and without inner spaces.
# NOTE(review): presumably every variant is rewritten to the first entry of its list —
# confirm with the consumer.

# SYSTEM
system_list = [ 'SYSTEM' ,"SYSTEM_","%SYSTEM%", "% SYSTEM %", "% SYSTEM%",
                "%System%", "% System %", "% System%",
                "%system%", "% system %", "% system%",
                "<SYSTEM>", "< SYSTEM >", "< SYSTEM>",
                "<System>", "< System >", "< System>",
                "<system>", "< system >", "< system>"]

# SYSTEM32
system32_list = ['SYSTEM32' ,"SYSTEM32_", "%SYSTEM32%", "% SYSTEM32 %", "% SYSTEM32%",
                "%System32%", "% System32 %", "% System32%",
                "%system32%", "% system32 %", "% system32%",
                "<SYSTEM32>", "< SYSTEM32 >", "< SYSTEM32>",
                "<System32>", "< System32 >", "< System32>",
                "<system32>", "< system32 >", "< system32>"]

# EMPTY
empty_list = ['EMPTY' ,"EMPTY_", "%EMPTY%", "% EMPTY %", "% EMPTY%",
                "%Empty%", "% Empty %", "% Empty%",
                "%empty%", "% empty %", "% empty%",
                "<EMPTY>", "< EMPTY >", "< EMPTY>",
                "<Empty>", "< Empty >", "< Empty>",
                "<empty>", "< empty >", "< empty>"]

# RANDOM_LETTERS (plural and singular; space- and underscore-separated spellings)
random_letters_list = ['RANDOM_LETTERS' ,'RANDOM_LETTER', "%RANDOM LETTERS%", "% RANDOM LETTERS %", "% RANDOM LETTERS%",
                "%Random Letters%", "% Random Letters %", "% Random Letters%",
                "%random letters%", "% random letters %", "% random letters%",
                "<RANDOM LETTERS>", "< RANDOM LETTERS >", "< RANDOM LETTERS>",
                "<Random Letters>", "< Random Letters >", "< Random Letters>",
                "<random letters>", "< random letters >", "< random letters>",
                "%RANDOM LETTER%", "% RANDOM LETTER %", "% RANDOM LETTER%",
                "%Random Letter%", "% Random Letter %", "% Random Letter%",
                "%random letter%", "% random letter %", "% random letter%",
                "<RANDOM LETTER>", "< RANDOM LETTER >", "< RANDOM LETTER>",
                "<Random Letter>", "< Random Letter >", "< Random Letter>",
                "<random letter>", "< random letter >", "< random letter>",
                "%RANDOM_LETTERS%", "% RANDOM_LETTERS %", "% RANDOM_LETTERS%",
                "%Random_Letters%", "% Random_Letters %", "% Random_Letters%",
                "%random_letters%", "% random_letters %", "% random_letters%",
                "<RANDOM_LETTERS>", "< RANDOM_LETTERS >", "< RANDOM_LETTERS>",
                "<Random_Letters>", "< Random_Letters >", "< Random_Letters>",
                "<random_letters>", "< random_letters >", "< random_letters>",
                "%RANDOM_LETTER%", "% RANDOM_LETTER %", "% RANDOM_LETTER%",
                "%Random_Letter%", "% Random_Letter %", "% Random_Letter%",
                "%random_letter%", "% random_letter %", "% random_letter%",
                "<RANDOM_LETTER>", "< RANDOM_LETTER >", "< RANDOM_LETTER>",
                "<Random_Letter>", "< Random_Letter >", "< Random_Letter>",
                "<random_letter>", "< random_letter >", "< random_letter>"]

# RANDOM_NUMBERS (plural and singular; space- and underscore-separated spellings)
random_numbers_list = ['RANDOM_NUMBERS' ,'RANDOM_NUMBER', "%RANDOM NUMBERS%", "% RANDOM NUMBERS %", "% RANDOM NUMBERS%",
                "%Random Numbers%", "% Random Numbers %", "% Random Numbers%",
                "%random numbers%", "% random numbers %", "% random numbers%",
                "<RANDOM NUMBERS>", "< RANDOM NUMBERS >", "< RANDOM NUMBERS>",
                "<Random Numbers>", "< Random Numbers >", "< Random Numbers>",
                "<random numbers>", "< random numbers >", "< random numbers>",
                "%RANDOM NUMBER%", "% RANDOM NUMBER %", "% RANDOM NUMBER%",
                "%Random Number%", "% Random Number %", "% Random Number%",
                "%random number%", "% random number %", "% random number%",
                "<RANDOM NUMBER>", "< RANDOM NUMBER >", "< RANDOM NUMBER>",
                "<Random Number>", "< Random Number >", "< Random Number>",
                "<random number>", "< random number >", "< random number>",
                "%RANDOM_NUMBERS%", "% RANDOM_NUMBERS %", "% RANDOM_NUMBERS%",
                "%Random_Numbers%", "% Random_Numbers %", "% Random_Numbers%",
                "%random_numbers%", "% random_numbers %", "% random_numbers%",
                "<RANDOM_NUMBERS>", "< RANDOM_NUMBERS >", "< RANDOM_NUMBERS>",
                "<Random_Numbers>", "< Random_Numbers >", "< Random_Numbers>",
                "<random_numbers>", "< random_numbers >", "< random_numbers>",
                "%RANDOM_NUMBER%", "% RANDOM_NUMBER %", "% RANDOM_NUMBER%",
                "%Random_Number%", "% Random_Number %", "% Random_Number%",
                "%random_number%", "% random_number %", "% random_number%",
                "<RANDOM_NUMBER>", "< RANDOM_NUMBER >", "< RANDOM_NUMBER>",
                "<Random_Number>", "< Random_Number >", "< Random_Number>",
                "<random_number>", "< random_number >", "< random_number>"]

# Spelling variants of "command and control"; the first entry is the bare upper-case token.
# The long-form "... server" variants were only present with the misspelling "sever",
# so the correctly spelled "... server" phrases were missing from the list. The correct
# spellings are added directly before the misspelled ones, which are kept because the
# misspelling may also occur in the processed texts.
# Longer phrases stay ahead of their shorter prefixes, as in the rest of the list.
command_and_control_list = ["COMMAND_AND_CONTROL", "c&c server", "c&c",
                "command and control server", "command and control sever", "command and control",
                "c2 server", "C2 server", "c2", "C2",
                "candc server", "candc", "cc server", "CC server",
                "command & control server", "command & control sever", "command & control",
                "command & controle server", "Command & Controle server",
                "command & controle sever", "Command & Controle sever", "Command & Controle",
                "CandC server", "CandC", "CnC server", "CnC"]

##### Phrasal Verb lists

In [None]:
# Inflected forms of "try to" / "attempt to" / "be going to".
try_going_to_list = ['tries to', 'try to', 'tried to', 'trying to',
               'attempts to' , 'attempt to', 'attempted to', 'attempting to',
               'am going to', 'is going to', 'are going to', 'was going to', 'were going to']

# Inflected forms of "be capable of".
capable_of_list = ['is capable of', 'are capable of', 'was capable of', 'were capable of', 'being capable of']

# Phrasings of "make/do/perform (the following | a further) (registry) modification(s)"
# in several tenses, singular and plural.
# NOTE(review): 'made the following modifications' is the only entry of its group
# without a trailing ' to' — confirm whether that is intended.
make_modification_list = ['makes the following registry modifications', 'make the following registry modifications',
                          'made the following registry modifications', 'making the following registry modifications',
                          'makes the following registry modification', 'make the following registry modification',
                          'made the following registry modification', 'making the following registry modification',
                          'makes the following additional registry modifications', 'make the following additional registry modifications',
                          'made the following additional registry modifications', 'making the following additional registry modifications',
                          'makes the following additional registry modification', 'make the following additional registry modification',
                          'made the following additional registry modification', 'making the following additional registry modification',
                          'does the following registry modifications', 'do the following registry modifications',
                          'did the following registry modifications', 'doing the following registry modifications',
                          'does the following registry modification', 'do the following registry modification',
                          'did the following registry modification', 'doing the following registry modification',
                          'by making the following registry modifications', 'by making the following registry modification',
                          'makes a further registry modification', 'make a further registry modification',
                          'made a further registry modification', 'making a further registry modification',
                          'performs a further registry modification', 'perform a further registry modification',
                          'performed a further registry modification', 'performing a further registry modification',
                          'does a further registry modification', 'do a further registry modification',
                          'did a further registry modification', 'doing a further registry modification',
                          'makes the following modifications to', 'make the following modifications to',
                          'made the following modifications', 'making the following modifications to',
                          'makes the following modification to', 'make the following modification to',
                          'made the following modification to', 'making the following modification to',
                          'makes following modification to', 'make following modification to',
                          'made following modification to', 'making following modification to',
                          'makes following modifications to', 'make following modifications to',
                          'made following modifications to', 'making following modifications to', ]

# Introductory phrases such as "the following files" / "the below paths" / "following URLs",
# each given in lower-case and capitalised form.
# The entries starting with "the ..." come first, followed by the ones without the article;
# within each part they appear ordered roughly from longer to shorter phrases.
# NOTE(review): presumably that order lets longer phrases match before their shorter
# prefixes — keep it when adding entries, and confirm with the consumer.
# NOTE(review): 'the following subfolders:' / 'The following subfolders:' are the only
# entries that include a trailing colon — confirm whether that is intended.
the_following_list = ['the following possibly malicious websites', 'The following possibly malicious websites',
                      'the following possibly malicious sites', 'The following possibly malicious sites', 'the following folders and subfolders',
                      'The following folders and subfolders', 'the following possibly malicious URL', 'The following possibly malicious URL',
                      'the following folders and file(s)', 'The following folders and file(s)', 'the following malicious websites',
                      'The following malicious websites', 'the following registry locations', 'The following registry locations',
                      'the following registry location', 'The following registry location', 'the following folders and files',
                      'The following folders and files', 'the following registry subkeys', 'The following registry subkeys',
                      'the following copies of itself', 'The following copies of itself', 'the following registry entries',
                      'The following registry entries', 'the following folders and file', 'The following folders and file',
                      'the following registry subkey', 'The following registry subkey', 'the following registry values', 'The following registry values',
                      'the following malicious sites', 'The following malicious sites', 'the following subdirectories', 'The following subdirectories',
                      'the following registry entry', 'The following registry entry', 'the following copy of itself', 'The following copy of itself',
                      'the following registry value', 'The following registry value', 'the following registry paths', 'The following registry paths',
                      'the following registry keys', 'The following registry keys', 'the following malicious URL', 'The following malicious URL',
                      'the following registry path', 'The following registry path', 'the following hash file(s)', 'The following hash file(s)',
                      'the following IP addresses', 'The following IP addresses', 'the following registry key', 'The following registry key',
                      'the following directories', 'The following directories', 'the following subfolders:', 'The following subfolders:',
                      'the following web servers', 'The following web servers', 'the following web server', 'The following web server',
                      'the following hash files', 'The following hash files', 'the following IP address', 'The following IP address',
                      'the following copies of', 'The following copies of', 'the following webservers', 'The following webservers', 'the following directory',
                      'The following directory', 'the following web sites', 'The following web sites', 'the following webserver', 'The following webserver',
                      'the following locations', 'The following locations', 'the following hash file', 'The following hash file', 'the following addresses',
                      'The following addresses', 'the following copy of', 'The following copy of', 'the following web site', 'The following web site',
                      'the following websites', 'The following websites', 'the following location', 'The following location', 'the following registry',
                      'The following registry', 'the below directories', 'The below directories', 'the following servers', 'The following servers',
                      'the following folders', 'The following folders', 'the following file(s)', 'The following file(s)', 'the following website',
                      'The following website', 'the following address', 'The following address', 'the following folder', 'The following folder',
                      'the following hashes', 'The following hashes', 'the following server', 'The following server', 'the following paths', 'The following paths',
                      'the following sites', 'The following sites', 'the following files', 'The following files', 'the following lines', 'The following lines',
                      'the following items', 'The following items', 'the below locations', 'The below locations', 'the below directory', 'The below directory',
                      'the following URLs', 'The following URLs', 'the below websites', 'The below websites', 'the below registry', 'The below registry',
                      'the following file', 'The following file', 'the following path', 'The following path', 'the below location', 'The below location',
                      'the following line', 'The following line', 'the registry value', 'The registry value', 'the following site', 'The following site',
                      'the following hash', 'The following hash', 'the following IPs', 'The following IPs', 'the below file(s)', 'The below file(s)',
                      'the following URL', 'The following URL', 'the following IP', 'The following IP', 'the below files', 'The below files', 'the below paths',
                      'The below paths', 'the below path', 'The below path', 'the below file', 'The below file', 'the following', 'The following', 'the below IPs',
                      'The below IPs', 'the registry', 'The registry', 'following folders and subfolders', 'Following folders and subfolders',
                      'following folders and file(s)', 'Following folders and file(s)', 'following registry locations', 'Following registry locations',
                      'following registry location', 'Following registry location', 'following folders and files', 'Following folders and files',
                      'following registry subkeys', 'Following registry subkeys', 'following folders and file', 'Following folders and file',
                      'following registry entries', 'Following registry entries', 'following registry subkey', 'Following registry subkey',
                      'following registry entry', 'Following registry entry', 'following directories', 'Following directories', 'following subfolders',
                      'Following subfolders', 'following directory', 'Following directory', 'following locations', 'Following locations', 'following location',
                      'Following location', 'following file(s)', 'Following file(s)', 'below directories', 'Below directories', 'following folders',
                      'Following folders', 'following folder', 'Following folder', 'below directory', 'Below directory', 'below locations', 'Below locations',
                      'following files', 'Following files', 'following paths', 'Following paths', 'registry value', 'Registry value', 'following URLs',
                      'Following URLs', 'following file', 'Following file', 'following path', 'Following path', 'below location', 'Below location', 'below websites',
                      'Below websites', 'following URL', 'Following URL', 'below file(s)', 'Below file(s)', 'below paths', 'Below paths', 'below files',
                      'Below files', 'below file', 'Below file', 'below path', 'Below path', 'below IPs', 'Below IPs']

##### Verb lists

In [None]:
# Groups of verbs that are treated as synonyms of one another, one inner list per group.
# NOTE(review): presumably every verb of a group is homogenized to the group's first
# entry ('delete' group -> 'unlink', etc.) — confirm with the consumer.
# A few groups also contain inflected forms (e.g. 'entrenches', 'beaconed').
verbs_lists_homogenization = [
    ['unlink', 'delete', 'clear', 'remove', 'erase', 'wipe', 'purge', 'expunge'],
    ['write', 'entrench', 'entrenches', 'entrenched', 'exfiltrate', 'exfiltrates', 'exfiltrated', 'exfil', 'exfils', 'exfiled', 'store', 'drop', 'install', 'place', 'deploy', 'implant', 'putfile', 'compose', 'create', 'copy', 'save', 'add', 'append', 'form'],
    ['read', 'survey', 'download', 'navigate', 'locate', 'gather', 'extract', 'obtain', 'acquire', 'check', 'detect', 'record'],
    ['exec', 'use', 'execute', 'run', 'launch', 'call', 'perform', 'list', 'invoke', 'inject', 'open', 'target', 'resume'],
    ['mmap', 'mmaps', 'mmap\'d', 'mmaped', 'allocate', 'assign'],
    ['fork', 'clone', 'spawn', 'issue', 'set'],
    ['setuid', 'elevate', 'impersonate'],
    ['send', 'transfer', 'post', 'postsinformation', 'postsinformations', 'move', 'transmit', 'deliver', 'push', 'redirect', 'beacon', 'beacons', 'beaconed'],
    ['receive', 'accept', 'take', 'get', 'collect'],
    ['connect', 'click', 'browse', 'portscan', 'alert', 'communicate'],
    ['change', 'modify'],
    ['exit', 'terminate', 'stop', 'end', 'finish', 'abort', 'conclude']
]

#### Pronouns and Subject ellipsis resolution

In [None]:
# Verbs (third-person singular and base form) listed for subject-ellipsis resolution.
# Every entry is followed by a comma: the one after 'make' was missing, which fused
# 'make' and 'tries' into the single bogus entry 'maketries'.
# 'try' is listed in lower case like every other base form; the capitalised 'Try'
# of the original list is kept as well. Repeated entries are kept as they were.
ellipsis_verbs = ['makes', 'make', 'tries', 'try', 'Try' ,'adds', 'add', 'runs', 'run', 'deletes',
                  'delete', 'removes', 'remove', 'registers', 'register', 'replicates',
                  'replicate', 'creates', 'create', 'executes', 'execute', 'modifies',
                  'modify', 'downloads', 'download', 'spreads', 'spread', 'conducts',
                  'conduct', 'copies', 'copy', 'forks', 'fork', 'writes', 'write', 'reads',
                  'read', 'retrieves', 'retrieve', 'redirects', 'redirect', 'wipes', 'wipe',
                  'exfiltrates', 'exfiltrate', 'installs', 'install', 'deploys', 'deploy',
                  'clears', 'clear', 'erases', 'erase', 'drops', 'drop', 'entrenches',
                  'entrench', 'runs', 'run', 'collects', 'collect', 'writes', 'write',
                  'locates', 'locate', 'allocates', 'allocate', 'clones', 'clone', 'uses',
                  'use', 'performs', 'perform', 'spawns', 'spawn', 'issues', 'issue', 'sets',
                  'set', 'clones', 'clone', 'executes', 'execute', 'launches', 'launch',
                  'saves', 'save', 'adds', 'add', 'extracts', 'extract', 'gets', 'get',
                  'injects', 'inject', 'obtains', 'obtain', 'gathers', 'gather', 'downloads',
                  'download', 'beacons', 'beacon', 'places', 'place', 'navigates', 'navigate',
                  'composes', 'compose', 'acquires', 'acquire', 'browses', 'browse',
                  'performs', 'perform', 'opens', 'open', 'sends', 'send', 'targets',
                  'target', 'accepts', 'accept', 'receives', 'receive', 'transfers',
                  'transfer', 'invokes', 'invoke', 'modify', 'modifies', 'connects',
                  'connect', 'communicates', 'communicate', 'posts', 'post', 'propagates',
                  'propagate', 'terminates', 'terminate', 'monitors', 'monitor', 'attempts',
                  'attempt', 'generates', 'generate', 'searches', 'search', 'contains',
                  'contain', 'hides', 'hide', 'infects', 'infect', 'appends', 'append',
                  'closes', 'close', 'checks', 'check']

### Functions

#### Setup

In [None]:
# The pattern library has issues on the first call of its verb helpers, so both
# helpers used by this notebook are called once here and any error is discarded.
def pattern_stopiteration_workaround():
    """Prime pattern's `en.tenses` and `conjugate` with one throw-away call each.

    The two warm-up calls are attempted independently, so a failure of the first
    does not prevent the second from being primed. Nothing is returned.
    """
    warmup_calls = (
        lambda: en.tenses('crying'),
        lambda: conjugate(verb='crying', tense=INFINITIVE),
    )
    for warmup in warmup_calls:
        try:
            warmup()
        except Exception:
            # Best-effort warm-up: an error here is the expected first-call failure.
            # `Exception` instead of a bare `except` so that KeyboardInterrupt and
            # SystemExit still propagate.
            pass
pattern_stopiteration_workaround()

#### <font color='yellow'>Sanitization</font>

##### Unrelated content removal

In [None]:
def analyze_sentence(sentence, tags='', update_tags=False):
  """Keep `sentence` only when the classifier predicts class 1.

  The sentence is tokenized with the module-level `tokenizer`, padded and
  truncated to `window_length`, and classified by the module-level `model`
  on `device`.

  Args:
    sentence: Text to classify.
    tags: Tag string associated with the sentence; returned untouched when the
      sentence is kept.
    update_tags: When True a `(sentence, tags)` tuple is returned instead of
      the sentence alone.

  Returns:
    The sentence (and tags) when the predicted class is 1, otherwise an empty
    string (and empty tags). Any class other than 1 is treated as "remove".
  """
  # Apply tokenization as for the training part
  inputs = tokenizer.encode_plus(
      sentence,
      add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
      max_length = window_length,     # Pad & truncate all sentences
      padding = 'max_length',         # Explicit form of the deprecated pad_to_max_length=True
      truncation = True,              # Previously enabled only implicitly via max_length
      return_attention_mask = True,   # Construct attn. masks
      return_tensors = 'pt',          # Return pytorch tensors
  )

  inputs = {key: value.to(device) for key, value in inputs.items()}

  # Make the prediction
  with torch.no_grad():
      logits = model(**inputs).logits

  probabilities = torch.softmax(logits, dim=1)
  predicted_class = torch.argmax(probabilities, dim=1).item()

  keep = predicted_class == 1
  if update_tags:
    return (sentence, tags) if keep else ('', '')
  return sentence if keep else ''

##### IOC Defanging

In [None]:
def defang_iocs_in_text(sentence):
    """Return `sentence` after passing it through `ioc_fanger.defang`."""
    return ioc_fanger.defang(sentence)

def fang_iocs_in_text(sentence):
    """Return `sentence` after passing it through `ioc_fanger.fang`."""
    return ioc_fanger.fang(sentence)

##### Misspelling Correction

In [None]:
def spell_check(sentence, max_words=150):
  """Spell-correct the first `max_words` words of `sentence` with `jsp`.

  Only the leading `max_words` whitespace-separated words are handed to
  `jsp.FixFragment`; the remaining words are appended unchanged. Words are
  re-joined with single spaces, so runs of whitespace are normalized.

  Args:
    sentence: Text to correct.
    max_words: Number of leading words sent to the spell checker
      (presumably to bound the input size for JamSpell - TODO confirm).

  Returns:
    The corrected text, without a trailing space when there is no remainder.
  """
  words = sentence.split()
  head = ' '.join(words[:max_words])
  tail = ' '.join(words[max_words:])
  corrected = jsp.FixFragment(head)
  if tail:
    return corrected + ' ' + tail
  return corrected

#### <font color='yellow'>Text Normalization</font>

##### Passive\Active Conversion

In [None]:
def pass2act_nouninv(noun):
    """Return the `pass2act_noundict` entry for `noun`, or `noun` itself.

    The lookup key is the lower-cased noun; when there is no entry the noun is
    returned exactly as given (original casing preserved).
    """
    key = noun.lower()
    return pass2act_noundict[key] if key in pass2act_noundict else noun

In [None]:
def pass2act(doc, rec=False):
  """Rewrite the passive sentences of `doc` in active voice.

  Every sentence produced by the `pass2act_nlp` pipeline is inspected through
  its dependency parse. A sentence is rewritten only when both a passive
  subject and a "by ..." agent are found; any other sentence is copied as is.

  Args:
    doc: Text to convert (one or more sentences).
    rec: True when called recursively on a subordinate/conjoined clause. The
      clause then keeps its casing and is always terminated with '.'.

  Returns:
    A `(text, is_passive)` tuple. `text` is the concatenation of the processed
    sentences, each followed by one space. `is_passive` is True when at least
    one sentence, or one recursively processed clause, was rewritten.
  """
  parse = pass2act_nlp(doc)
  newdoc = ''
  is_passive = False
  # Placeholder '.' tokens that pad the auxiliary list so the sliding window
  # used for conjugation always has neighbours to look at. Parsed once here
  # rather than once per sentence.
  null_aux = list(list(pass2act_nlp('. .').sents)[0])
  for sent in parse.sents:

      # Init parts of sentence to capture:
      subjpass = ''
      subj = ''
      verb = ''
      verbtense = ''
      adverb = {'bef':'', 'aft':''}
      part = ''
      prep = ''
      agent = ''
      aplural = False
      advcltree = None
      aux = list(null_aux) # start with the 'null' placeholder elements
      xcomp = ''
      punc = '.'
      # Analyse dependency tree:
      for word in sent:
          if word.dep_ == 'advcl':
              if word.head.dep_ in ('ROOT', 'auxpass'):
                  advcltree = word.subtree
          if word.dep_ == 'nsubjpass':
              if word.head.dep_ == 'ROOT':
                  subjpass = ''.join(w.text_with_ws for w in word.subtree).strip()
          if word.dep_ == 'nsubj':
              subj = ''.join(w.text_with_ws for w in word.subtree).strip()
              if word.head.dep_ == 'auxpass':
                  if word.head.head.dep_ == 'ROOT':
                      subjpass = subj
          if word.dep_ in ('advmod','npadvmod','oprd'):
              if word.head.dep_ == 'ROOT':
                  # Adverbs seen before the root verb go in front of it, later ones after
                  if verb == '':
                      adverb['bef'] = ''.join(w.text_with_ws for w in word.subtree).strip()
                  else:
                      adverb['aft'] = ''.join(w.text_with_ws for w in word.subtree).strip()
          if word.dep_ == 'auxpass':
              if word.head.dep_ == 'ROOT':
                  if not subjpass:
                      subjpass = subj
          if word.dep_ in ('aux','auxpass','neg'):
              if word.head.dep_ == 'ROOT':
                  aux += [word]
          if word.dep_ == 'ROOT':
              verb = word.text
              if word.tag_ == 'VB':
                  verbtense = en.INFINITIVE
              elif word.tag_ == 'VBD':
                  verbtense = en.PAST
              elif word.tag_ == 'VBG':
                  verbtense = en.PRESENT
                  # NOTE(review): this value is overwritten by the reset before
                  # the conjugation step below, so it is never read.
                  verbaspect = en.PROGRESSIVE
              elif word.tag_ == 'VBN':
                  verbtense = en.PAST
              else:
                  try:
                      verbtense = en.tenses(word.text)[0][0]
                  except IndexError:
                      pass
          if word.dep_ == 'prt':
              if word.head.dep_ == 'ROOT':
                  part = ''.join(w.text_with_ws for w in word.subtree).strip()
          if word.dep_ == 'prep':
              if word.head.dep_ == 'ROOT':
                  prep = ''.join(w.text_with_ws for w in word.subtree).strip()
          if word.dep_.endswith('obj'):
              if word.head.dep_ == 'agent':
                  if word.head.head.dep_ == 'ROOT':
                      agent = ''.join(w.text + ', ' if w.dep_=='appos' else (w.text_with_ws) for w in word.subtree).strip()
                      aplural = word.tag_ in ('NNS','NNPS')
          if word.dep_ in ('xcomp','ccomp','conj'):
              if word.head.dep_ == 'ROOT':
                  xcomp = ''.join(w.text_with_ws for w in word.subtree).strip()
                  that = xcomp.startswith('that')
                  # Convert the nested clause too and remember whether it was passive
                  xcomp, check_is_passive = pass2act(xcomp, True)
                  xcomp = xcomp.strip(' .')
                  if not is_passive:
                    is_passive = check_is_passive
                  if not xcomp.startswith('that') and that:
                      xcomp = 'that '+xcomp
          if word.dep_ == 'punct' and not rec:
              if word.text != '"':
                  punc = word.text

      # exit if not passive:
      if subjpass == '':
          newdoc += str(sent) + ' '
          continue

      # if no agent is found, the sentence cannot be inverted: keep it as is
      if agent == '':
          newdoc += str(sent) + ' '
          continue

      # invert nouns:
      is_passive = True
      agent = pass2act_nouninv(agent)
      subjpass = pass2act_nouninv(subjpass)

      # Conjugation: rebuild the auxiliary string for the new (agent) subject
      auxstr = ''
      num = en.SINGULAR if not aplural or agent in ('he','she') else en.PLURAL
      aux.append(aux[0])
      verbaspect = None

      # Slide over the auxiliaries: pp/p are the previous two, a the current, n the next
      for (pp, p, a, n) in zip(aux,aux[1:],aux[2:],aux[3:]):
          if a.lemma_ == '.':
              continue

          if a.lemma_ == 'not':
              if p.lemma_ == 'be':
                  if n.lemma_ == 'be':
                      verbtense = en.tenses(a.text)[0][0]
                      auxstr += en.conjugate('be',tense=en.tenses(p.text)[0][0],number=num) + ' '
                      verbaspect = en.PROGRESSIVE
                  else:
                    if len(en.tenses(p.text)) > 0 and len(en.tenses(p.text)[0]) > 0:
                      auxstr += en.conjugate('do',tense=en.tenses(p.text)[0][0],number=num) + ' '
                    else:
                      auxstr += p.text + ' '
                    verbtense = en.INFINITIVE
              auxstr += 'not '
          elif a.lemma_ == 'be':
              if p.lemma_ == 'be':
                  verbtense = en.tenses(a.text)[0][0]
                  auxstr += en.conjugate('be',tense=en.tenses(a.text)[0][0],number=num) + ' '
                  verbaspect = en.PROGRESSIVE
              elif p.tag_ == 'MD':
                  verbtense = en.INFINITIVE
          elif a.lemma_ == 'have':
              # After a modal ("could have been ...") 'have' must not take the
              # third-person singular form. The original line used '==' here,
              # which compared and discarded the result instead of assigning.
              have_num = en.PLURAL if p.tag_ == 'MD' else num
              if len(en.tenses(a.text)) > 0:
                auxstr += en.conjugate('have',tense=en.tenses(a.text)[0][0],number=have_num) + ' '
              else:
                auxstr += a.text + ' '
              if n.lemma_ == 'be':
                  verbaspect = en.PROGRESSIVE
                  verbtense = en.tenses(n.text)[0][0]
          else:
              auxstr += a.text_with_ws
      auxstr = auxstr.lower().strip()

      if verbaspect:
          verb = en.conjugate(verb,tense=verbtense,aspect=verbaspect)
      else:
          verb = en.conjugate(verb,tense=verbtense)

      # Rebuild the adverbial clause, turning progressive verbs into "which <verb>"
      advcl = ''
      if advcltree:
          for w in advcltree:
              if w.pos_ == 'VERB' and len(en.tenses(w.text)) > 0 and len(en.tenses(w.text)[0]) > 0 and en.tenses(w.text)[0][4] == en.PROGRESSIVE:
                  advcl += 'which ' + en.conjugate(w.text,tense=en.tenses(verb)[0][0]) + ' '
              else:
                  advcl += w.text_with_ws

      # Assemble the active sentence, skipping the parts that were not found
      newsent = ' '.join(list(filter(None, [agent,auxstr,adverb['bef'],verb,part,subjpass,adverb['aft'],advcl,prep,xcomp])))+punc
      if not rec:
          newsent = newsent[0].upper() + newsent[1:]
      newdoc += newsent + ' '
  return newdoc, is_passive

In [None]:
def remove_punctuation(sentence):
  """Return `sentence` with every character found in `string.punctuation` removed."""
  kept = []
  for symbol in sentence:
    if symbol in string.punctuation:
      continue
    kept.append(symbol)
  return ''.join(kept)

def convert_active_to_passive(sentence, tags='', update_tags=False):
  """Rewrite a passive sentence in active voice and realign its per-token tags.

  NOTE: despite its name, this function delegates to `pass2act`, i.e. it
  converts passive voice into active voice.

  Args:
    sentence: Whitespace-tokenized sentence.
    tags: Comma-separated tags, one per whitespace token of `sentence`.
    update_tags: When True the tags are re-ordered to follow the rewritten
      sentence and a `(sentence, tags)` tuple is returned.

  Returns:
    * Without `update_tags`: the rewritten sentence exactly as produced by
      `pass2act`, or the original sentence when nothing was converted.
    * With `update_tags`: `(new_sentence, new_tags)`. When nothing was
      converted the original `(sentence, tags)` is returned. When the tags
      cannot be realigned, the punctuation-stripped sentence and its tags are
      returned instead.
  """
  # Contractions lose their apostrophe in the cleaned original ("can't" ->
  # "cant") and appear as modal + "not" glued together in the rewritten text.
  modal_contractions = {
      'canot': 'cant',
      'couldnot': 'couldnt',
      'mustnot': 'mustnt',
      'shouldnot': 'shouldnt',
  }

  def drop_punct_tokens(text):
    # Remove whitespace tokens that are a bare punctuation mark
    return ' '.join([word for word in text.split() if word not in string.punctuation])

  def diff_tokens(left, right):
    # Words shared by both token lists, and words found only in `left`
    only_left = [word for word in set(left) - set(right) if word not in string.punctuation]
    return set(left) & set(right), only_left

  def positions(tokens, word):
    return [i for i, token in enumerate(tokens) if token == word]

  stripped_sentence = drop_punct_tokens(sentence)

  # Apply conversion
  new_sentence, is_passive = pass2act(stripped_sentence)

  if not is_passive or new_sentence == stripped_sentence:  # Return original sentence if no modification
    if update_tags:
      return sentence, tags
    return sentence

  if not update_tags:
    return new_sentence

  # From here on work with the stripped sentence and the tags of its tokens
  tags_split = tags.split(',')
  tags_split = [tags_split[idx] for idx, word in enumerate(sentence.split()) if word not in string.punctuation]
  sentence = stripped_sentence
  tags = ','.join(tags_split)

  new_sentence = drop_punct_tokens(new_sentence)

  # Compare both sentences on lower-cased, punctuation-free tokens
  tokens1 = remove_punctuation(sentence).lower().split()
  tokens2 = remove_punctuation(new_sentence).lower().split()

  common_words, deleted_words = diff_tokens(tokens1, tokens2)

  # A deleted word may be a noun/pronoun that pass2act swapped via the noun dictionary
  pronouns_changed = False
  for d_w in deleted_words:
    if d_w in pass2act_noundict:
      pronouns_changed = True
      tokens1[tokens1.index(d_w)] = pass2act_noundict[d_w]

  if pronouns_changed:
    common_words, deleted_words = diff_tokens(tokens1, tokens2)

  # Words that are in the new sentence but not in the original one
  added_words = [word for word in set(tokens2) - set(tokens1) if word not in string.punctuation]

  # An added word is expected to be a verb/auxiliary that changed form: map it
  # back to the deleted word it replaces so that the tag can follow it.
  verbs_changed = False
  add_not_token = False
  if added_words:
    added_words_lower = [word.lower() for word in added_words]
    deleted_words_lower = [word.lower() for word in deleted_words]
    tokens1_lower = [word.lower() for word in tokens1]
    lemmatizer = WordNetLemmatizer()

    for word in added_words:
      # Particular modal cases
      contracted = modal_contractions.get(word.lower())
      if contracted is not None:
        tokens1[tokens1_lower.index(contracted)] = word
        verbs_changed = True
        continue

      # Same verb lemma as a deleted word -> same token in a different form
      lemm_added_word = lemmatizer.lemmatize(word.lower(), 'v')
      for d_w in deleted_words:
        if lemmatizer.lemmatize(d_w.lower(), 'v') == lemm_added_word:
          verbs_changed = True
          tokens1[tokens1.index(d_w)] = word

    # Particular cases with be --> do ('does' takes precedence over 'do')
    new_aux = next((aux for aux in ('does', 'do') if aux in added_words_lower), None)
    if new_aux is not None:
      negated = 'not' in added_words_lower
      candidates = ('arent', 'isnt') if negated else ('are', 'is')
      old_aux = next((form for form in candidates if form in deleted_words_lower), None)
      if old_aux is not None:
        tokens1[tokens1_lower.index(old_aux)] = new_aux
        verbs_changed = True
        if negated:
          # "isnt" became two tokens ("does not"): account for the extra one
          tokens1 = tokens1 + ['not']
          add_not_token = True

  if verbs_changed:
    common_words, deleted_words = diff_tokens(tokens1, tokens2)

  # Indexes (in the original tokens) of the words that disappeared
  deleted_idx = []
  for word in deleted_words:
    deleted_idx.extend(positions(tokens1, word))

  # A common word that occurs fewer times in the new sentence lost its first occurrences
  for word in common_words:
    position1 = positions(tokens1, word)
    surplus = len(position1) - len(positions(tokens2, word))
    if surplus > 0:
      deleted_idx.extend(position1[:surplus])

  # Start from the original tags minus those of the deleted words
  dropped = set(deleted_idx)
  new_tags_split = [tag for i, tag in enumerate(tags_split) if i not in dropped]

  if add_not_token:
    tags_split = tags_split + ['O']
    new_tags_split = new_tags_split + ['O']

  # Move the tag of every common word to the position the word has in the new sentence
  for word in common_words:
    position1 = positions(tokens1, word)
    position2 = positions(tokens2, word)

    if position1 != position2:
      try:
        for idx1, idx2 in zip(position1, position2):
          new_tags_split[idx2] = tags_split[idx1]
      except IndexError:
        # Token/tag misalignment: give up and keep the unconverted sentence
        return sentence, tags
    elif len(position1) == 1 and position1[0] < len(new_tags_split) and position1[0] < len(tags_split):
      # Same position in both sentences: make sure it carries the original tag
      new_tags_split[position1[0]] = tags_split[position1[0]]

  if len(new_sentence.split()) != len(new_tags_split):
    # Different length between sentence and tags: keep the unconverted sentence
    return sentence, tags
  return new_sentence, ','.join(new_tags_split)

##### Synonym Homogenization

In [None]:
def get_noun_lists():
  """Return the module-level synonym lists as one tuple, in a fixed order."""
  return (
      appdata_list,
      temp_list,
      userprofile_list,
      systemroot_list,
      profilename_list,
      username_list,
      common_appdata_list,
      random_letters_list,
      random_numbers_list,
      systemdrives_list,
      system32_list,
      system_list,
      windir_list,
      windows_list,
      empty_list,
      programfiles_list,
      programfile_list,
      defaultuserprofile,
      homefolder_list,
      homepath_list,
      command_and_control_list,
  )

# Module-level tuple of synonym lists, built once when this cell runs.
noun_lists = get_noun_lists()

In [None]:
def get_noun_regex_list(idx):
  """Compile a case-insensitive regex matching the synonyms of `noun_lists[idx]`.

  The first element of the list is skipped (the caller substitutes it for the
  matches); every following element is a synonym to match.
  Synonyms containing '<', '>', '%' or '&' are matched literally, because
  '\\b' does not recognise a boundary next to those characters; all other
  synonyms are wrapped in word boundaries.

  Args:
    idx: Index of the synonym list inside `noun_lists`.

  Returns:
    A compiled pattern. When the list has no synonyms the pattern never
    matches, so substituting with it leaves the text untouched.
  """
  word_bounded = []
  literal = []
  for el in noun_lists[idx][1:]:
    if '<' in el or '>' in el or '%' in el or '&' in el:
      literal.append(re.escape(el))
    else:
      word_bounded.append(re.escape(el))

  alternatives = []
  if word_bounded:
    alternatives.append(r'\b(?:' + '|'.join(word_bounded) + r')\b')
  alternatives.extend(literal)

  if not alternatives:
    # No synonym to look for: an empty negative lookahead can never match
    return re.compile(r'(?!)', re.IGNORECASE)

  return re.compile('|'.join(alternatives), re.IGNORECASE)

def get_noun_regex_lists():
  """Return one compiled regex per entry of `noun_lists`, in the same order.

  The regexes are derived from the current length of `noun_lists`, so adding
  or removing a synonym list does not require touching this function.
  """
  return tuple(get_noun_regex_list(idx) for idx in range(len(noun_lists)))

# Module-level tuple of compiled synonym regexes, built once when this cell runs.
noun_regex_lists = get_noun_regex_lists()

In [None]:
"""
Given a list of tags, return the element with more occurences
If there are more tags, it excludes 'O' tags in order to give more importance to entities
"""
def get_major_tag(tags):
  """Return the most frequent entity among `tags`, formatted as a 'B-' tag.

  Tags longer than one character lose their two-character prefix (e.g. 'B-',
  'I-') before counting. When more than one distinct entity is present, 'O'
  is ignored so that real entities win. Ties go to the entity seen first.

  Args:
    tags: Sequence of tag strings.

  Returns:
    'B-<entity>' for the winning entity, or 'O' when 'O' wins or `tags` is
    empty.
  """
  if not tags:
    return 'O'  # Nothing to vote on

  # Remove prefix and keep entity
  entities = [tag[2:] if len(tag) > 1 else tag for tag in tags]
  entity_counts = Counter(entities)

  if len(entity_counts) > 1 and 'O' in entity_counts:
    del entity_counts['O']

  # max() returns the first key holding the highest count (insertion order)
  major_entity = max(entity_counts, key=entity_counts.get)

  if major_entity != 'O':
    return 'B-' + major_entity  # Format the tag

  return major_entity  # 'O'

In [None]:
"""
  tags=[], update_tags=False are optional since the function could be called during the prediction-phase
  During the train-phase, just specify the arguments to update the tags associated to the sentence

"""
def homogenization_noun(sentence, tags='', update_tags=False):
  """Replace noun synonyms with their canonical form and merge their tags.

  For every synonym list in `noun_lists`, each match of the corresponding
  regex in `noun_regex_lists` is replaced by the first element of the list.

  Args:
    sentence: Whitespace-tokenized sentence.
    tags: Comma-separated tags, one per whitespace token of `sentence`.
    update_tags: When True, the tags covered by each replaced keyword are
      collapsed into the single tag chosen by `get_major_tag`.

  Returns:
    The new sentence, or `(new_sentence, new_tags)` with `update_tags`. When
    the resulting tags and tokens differ in number, the original
    `(sentence, tags)` is returned.
  """
  words = sentence.split()
  word_count = len(words)
  original_tags = tags.split(',')

  pending = []  # One entry per keyword occurrence located in the original sentence
  new_sentence = sentence

  for synonyms, synonym_regex in zip(noun_lists, noun_regex_lists):
    new_sentence = synonym_regex.sub(synonyms[0], new_sentence)  # Apply regex

    if not update_tags:  # Prediction phase - nothing else to track
      continue

    # Training phase - locate every distinct match to adapt its tags
    for matched_word in set(synonym_regex.findall(sentence)):
      span = len(matched_word.split())  # Number of words in the keyword

      # '\b' does not work next to these characters, so match them literally
      if '<' in matched_word or '>' in matched_word or '%' in matched_word or '&' in matched_word:
        window_pattern = re.escape(matched_word)
      else:
        window_pattern = r'\b{}\b'.format(re.escape(matched_word))

      # Slide a window of `span` words over the original sentence
      for start in range(word_count):
        stop = start + span
        if stop > word_count:
          break

        if re.search(window_pattern, ' '.join(words[start:stop]), re.IGNORECASE):
          pending.append({
            'word_to_change': matched_word,
            'tags-idx': list(range(start, stop)),
            'final-tag': get_major_tag(original_tags[start:stop])
          })

  if not update_tags:
    return new_sentence

  # Process the occurrences from the first to the last one in the sentence
  pending.sort(key=lambda entry: entry['tags-idx'][0])

  new_tags = []
  cursor = 0
  while pending:
    entry = pending.pop(0)
    covered = entry['tags-idx']

    # Copy the tags that come before the keyword
    while cursor not in covered:
      new_tags.append(original_tags[cursor])
      cursor += 1

    # The whole keyword collapses into one token, hence one tag
    new_tags.append(entry['final-tag'])
    cursor = covered[-1] + 1

    # Skip further matches that overlap the words just handled
    while pending and covered[-1] >= pending[0]['tags-idx'][0]:
      pending.pop(0)

  new_tags.extend(original_tags[cursor:])  # Copy remaining tags

  if len(new_tags) != len(new_sentence.split()):
    # Tags and tokens no longer line up: keep the original pair
    return sentence, tags

  return new_sentence, ','.join(new_tags)

In [None]:
"""
  tags=[], update_tags=False are optional since the function could be called during the prediction-phase
  During the train-phase, just specify the arguments to update the tags associated to the sentence

"""
def homogenization_try_going_to(sentence, tags='', update_tags=False):
  """Drop "try to"-like phrases, keeping only the verb that follows them.

  For every phrase in `try_going_to_list` found in the sentence, the text
  "<phrase> <next token>" is replaced by "<next token>".

  Args:
    sentence: Whitespace-tokenized sentence.
    tags: Comma-separated tags, one per whitespace token of `sentence`.
    update_tags: When True, the tags of the removed phrase words are dropped
      (the tag of the following token is kept).

  Returns:
    The new sentence, or `(new_sentence, new_tags)` with `update_tags`. When
    the resulting tags and tokens differ in number, the original
    `(sentence, tags)` is returned.
  """
  new_sentence = sentence

  sentence_split = sentence.split()
  len_sentence_split = len(sentence_split)

  tags_split = tags.split(',')

  tags_to_change_list = []

  for element in try_going_to_list:
    if element not in new_sentence.lower():
      continue

    # The phrase is escaped (it is text, not a regex) and must not start in
    # the middle of a word, e.g. "tries to" inside "retries to".
    match_el = re.search(r'(?<!\w)' + re.escape(element) + r'[ ]+(\S+)', new_sentence, re.IGNORECASE)
    if not match_el:
      continue

    verb = match_el.group(1)
    matched_group = match_el.group()
    new_sentence = new_sentence.replace(matched_group, verb)

    if update_tags:
      len_matched_group = len(matched_group.split())  # Count number of words in the keyword
      # The matched text may contain regex metacharacters: search it literally
      matched_pattern = re.escape(matched_group)

      for idx, _ in enumerate(sentence_split):  # Search the matched keyword in the original sentence and retrieve indexes
        if idx + len_matched_group > len_sentence_split:
          break

        if re.search(matched_pattern, ' '.join(sentence_split[idx:idx+len_matched_group]), re.IGNORECASE):
          tags_to_change_list.append({  # Save indexes of the phrase words (the trailing verb is excluded)
            'word_to_change': matched_group,
            'tags-idx': [i for i in range(idx, idx+len_matched_group-1)]
          })

  if not update_tags:
    return new_sentence

  # Sort indexes from the first appearance to the last in the sentence
  tags_to_change_list = sorted(tags_to_change_list, key=lambda x: x['tags-idx'][0])

  new_tags = []
  idx = 0
  while len(tags_to_change_list) > 0:  # Update tags
    tags_idx = tags_to_change_list[0]['tags-idx']

    while idx not in tags_idx:  # Copy the tags that come before the phrase
      new_tags.append(tags_split[idx])
      idx += 1

    idx = tags_idx[-1] + 1  # Skip the tags of the removed phrase words

    tags_to_change_list.pop(0)

    # Skip further matches that overlap the words just handled
    while len(tags_to_change_list) > 0 and tags_idx[-1] >= tags_to_change_list[0]['tags-idx'][0]:
      tags_to_change_list.pop(0)

  while idx < len(tags_split):  # Copy remaining tags
    new_tags.append(tags_split[idx])
    idx += 1

  if len(new_tags) != len(new_sentence.split()):
    # Tags and tokens no longer line up: keep the original pair
    return sentence, tags

  new_tags = ','.join(new_tags)
  return new_sentence, new_tags

In [None]:
def custom_partition(sentence, element_split):  # To preserve original case
    """Split `sentence` around the first case-insensitive match of `element_split`.

    Works like `str.partition` but ignores case when searching, returns the
    matched text with the casing it has in `sentence`, and strips surrounding
    whitespace from each part.

    Args:
        sentence: Text to split.
        element_split: Text to look for (any casing).

    Returns:
        A `(before, element, after)` tuple. When `element_split` is not found
        the result is `(sentence.strip(), '', '')`.
    """
    # Find the index of the element in a case-insensitive manner
    index = sentence.lower().find(element_split.lower())
    if index == -1:
        # Without this guard the slices below would be built from index -1
        return sentence.strip(), '', ''

    end = index + len(element_split)

    # Slice the original sentence so the original casing is preserved
    before = sentence[:index].strip()
    element = sentence[index:end].strip()
    after = sentence[end:].strip()
    return before, element, after


"""
  tags=[], update_tags=False are optional since the function could be called during the prediction-phase
  During the train-phase, just specify the arguments to update the tags associated to the sentence

"""
def homogenization_is_capable_of(sentence, tags='', update_tags=False):
  """Replace "<capable-of phrase> <gerund>" with the infinitive of the gerund.

  For every phrase of `capable_of_list` contained (case-insensitively) in the
  sentence, the word right after the phrase is POS-tagged with NLTK. When it is
  tagged as a gerund (VBG) the phrase is dropped and the gerund is replaced by
  its infinitive; otherwise only the phrase and that word are lowercased.

  Args:
    sentence: Whitespace-tokenised sentence.
    tags: Comma-separated tags, one per sentence token. Only read when
      `update_tags` is True.
    update_tags: When True the tags of the phrase words are removed as well.

  Returns:
    The new sentence, or `(new_sentence, new_tags)` when `update_tags` is True.
    If the rewritten sentence and its tags end up with different lengths the
    untouched `(sentence, tags)` pair is returned instead.
  """
  new_sentence = sentence

  sentence_split = sentence.split()
  len_sentence_split = len(sentence_split)

  tags_to_change_list = []

  for element in capable_of_list:
    if element not in new_sentence.lower():
      continue

    # `element` is rebound to the phrase as it is cased inside the sentence
    before_element, element, after_element = custom_partition(new_sentence, element)

    after_words = after_element.split()
    if not after_words:
      # The phrase closes the sentence, so there is no verb to convert
      # (indexing the first following word used to raise IndexError here)
      continue

    verb_ing = after_words[0].lower()
    remaining_after_element = ' '.join(after_words[1:])

    # Reconstruct phrase to preserve original case but lowercase verb to analyze to avoid errors
    new_sentence = before_element + ' ' + element.lower() + ' ' + verb_ing + ' ' + remaining_after_element

    if any(word == verb_ing and pos == 'VBG' for word, pos in nltk.pos_tag(new_sentence.split())):
      verb_present_form = conjugate(verb=verb_ing, tense=INFINITIVE)
      new_sentence = before_element + " " + verb_present_form + " " + remaining_after_element

    if update_tags:
      matched_group = element + ' ' + verb_ing
      len_matched_group = len(matched_group.split())  # Count number of words in the keyword
      # The phrase is literal text, not a pattern: escape it as homogenization_the_following does
      matched_pattern = re.escape(matched_group)

      # Search the matched keyword in the original sentence and retrieve indexes
      for idx in range(len_sentence_split - len_matched_group + 1):
        window = ' '.join(sentence_split[idx:idx+len_matched_group])
        if re.search(matched_pattern, window, re.IGNORECASE):
          tags_to_change_list.append({
            'word_to_change': matched_group,
            # The verb itself stays in the sentence, so only the phrase words lose their tag
            'tags-idx': list(range(idx, idx+len_matched_group-1))
          })

  if not update_tags:
    return new_sentence

  tags_split = tags.split(',')
  tags_to_change_list.sort(key=lambda x: x['tags-idx'][0])  # From the first appearance to the last

  new_tags = []
  idx = 0
  while tags_to_change_list:
    tags_idx = tags_to_change_list.pop(0)['tags-idx']

    while idx not in tags_idx:  # Keep every tag that precedes the removed words
      new_tags.append(tags_split[idx])
      idx += 1

    idx = tags_idx[-1] + 1  # Skip the tags of the removed words

    # Discard further matches that overlap the words just handled
    while tags_to_change_list and tags_idx[-1] >= tags_to_change_list[0]['tags-idx'][0]:
      tags_to_change_list.pop(0)

  new_tags.extend(tags_split[idx:])  # Copy remaining tags

  if len(new_tags) != len(new_sentence.split()):
    # Sentence and tags went out of sync: give back the input untouched
    return sentence, tags

  return new_sentence, ','.join(new_tags)

In [None]:
def custom_split(sentence, element):
  """Case-insensitively split `sentence` around the first occurrence of `element`.

  Args:
    sentence: Text to split.
    element: Phrase to look for (matched case-insensitively).

  Returns:
    A `(before, after)` tuple with the text on each side of the phrase, whitespace
    included. When the phrase does not occur the result is `(sentence, '')`.
  """
  index = sentence.lower().find(element.lower())
  if index == -1:
    # find() reports "not found" as -1; slicing with it would return garbage
    # pieces of the sentence instead of leaving it intact.
    return sentence, ''

  return sentence[:index], sentence[index+len(element):]

"""
  tags=[], update_tags=False are optional since the function could be called during the prediction-phase
  During the train-phase, just specify the arguments to update the tags associated to the sentence

"""
def homogenization_modification(sentence, tags='', update_tags=False):
  """Replace every phrase of `make_modification_list` with the verb 'modify'.

  Only the first occurrence of each phrase is replaced in the sentence (the
  phrase is located case-insensitively through `custom_split`).

  Args:
    sentence: Whitespace-tokenised sentence.
    tags: Comma-separated tags, one per sentence token. Only read when
      `update_tags` is True.
    update_tags: When True the tags of the replaced words are collapsed into a
      single 'O' tag.

  Returns:
    The new sentence, or `(new_sentence, new_tags)` when `update_tags` is True.
    If the rewritten sentence and its tags end up with different lengths the
    untouched `(sentence, tags)` pair is returned instead.
  """
  new_sentence = sentence

  sentence_split = sentence.split()
  len_sentence_split = len(sentence_split)

  tags_to_change_list = []

  for element in make_modification_list:
    if element not in new_sentence.lower():
      continue

    before, after = custom_split(new_sentence, element)
    new_sentence = before + 'modify' + after

    if update_tags:
      len_element = len(element.split())  # Count number of words in the keyword
      # The phrase is literal text, not a pattern: escape it as homogenization_the_following does
      element_pattern = re.escape(element)

      # Search the matched keyword in the original sentence and retrieve indexes
      for idx in range(len_sentence_split - len_element + 1):
        window = ' '.join(sentence_split[idx:idx+len_element])
        if re.search(element_pattern, window, re.IGNORECASE):
          tags_to_change_list.append({  # Save indexes of the keyword to update tags
            'word_to_change': element,
            'tags-idx': list(range(idx, idx+len_element)),
            'final-tag': 'O'
          })

  if not update_tags:
    return new_sentence

  tags_split = tags.split(',')
  tags_to_change_list.sort(key=lambda x: x['tags-idx'][0])  # From the first appearance to the last

  new_tags = []
  idx = 0
  while tags_to_change_list:
    change = tags_to_change_list.pop(0)
    tags_idx = change['tags-idx']

    while idx not in tags_idx:  # Keep every tag that precedes the replaced words
      new_tags.append(tags_split[idx])
      idx += 1

    new_tags.append(change['final-tag'])  # One tag for the single word 'modify'
    idx = tags_idx[-1] + 1

    # Discard further matches that overlap the words just handled
    while tags_to_change_list and tags_idx[-1] >= tags_to_change_list[0]['tags-idx'][0]:
      tags_to_change_list.pop(0)

  new_tags.extend(tags_split[idx:])  # Copy remaining tags

  if len(new_tags) != len(new_sentence.split()):
    # Sentence and tags went out of sync: give back the input untouched
    return sentence, tags

  return new_sentence, ','.join(new_tags)

In [None]:
"""
  tags=[], update_tags=False are optional since the function could be called during the prediction-phase
  During the train-phase, just specify the arguments to update the tags associated to the sentence

"""
def homogenization_the_following(sentence, tags='', update_tags=False):  # Also removes the tags of the removed words
    """Remove the phrases listed in `the_following_list` from a sentence.

    Every (case-sensitive) occurrence of a listed phrase is deleted; a colon glued
    to the first occurrence is deleted too. Whitespace is then collapsed and the
    sentence is trimmed (the original code called ``strip()`` but discarded its
    result, so the trimming never took effect).

    Args:
        sentence: Whitespace-tokenised sentence.
        tags: Comma-separated tags, one per sentence token. Only read when
            `update_tags` is True.
        update_tags: When True the tags of the removed words are removed as well.

    Returns:
        The new sentence, or `(new_sentence, new_tags)` when `update_tags` is True.
        If the rewritten sentence and its tags end up with different lengths the
        untouched `(sentence, tags)` pair is returned instead.
    """
    tags_to_change_list = []
    new_sentence = sentence

    for element in the_following_list:
        if element not in new_sentence:
            continue

        s_split = new_sentence.split(element)
        if s_split[1].startswith(':'):  # Drop the colon that directly follows the first occurrence
            s_split[1] = s_split[1][1:]
        new_sentence = ''.join(s_split)

        if update_tags:
            sentence_split = sentence.split()
            len_matched_element = len(element.split())  # Count number of words in the keyword
            re_element = re.escape(element)

            # Search the matched keyword in the original sentence and retrieve indexes
            for idx in range(len(sentence_split) - len_matched_element + 1):
                window = ' '.join(sentence_split[idx:idx+len_matched_element])
                if re.search(re_element, window, re.IGNORECASE):
                    tags_to_change_list.append({  # Save indexes of the keyword to update tags
                        'word_to_change': element,
                        'tags-idx': list(range(idx, idx+len_matched_element))
                    })

    new_sentence = re.sub(r'\s+', ' ', new_sentence).strip()

    if not update_tags:
        return new_sentence

    tags_to_change_list.sort(key=lambda x: x['tags-idx'][0])  # From the first appearance to the last
    tags_split = tags.split(',')

    new_tags = []
    idx = 0
    while tags_to_change_list:
        tags_idx = tags_to_change_list.pop(0)['tags-idx']

        while idx not in tags_idx:  # Keep every tag that precedes the removed words
            new_tags.append(tags_split[idx])
            idx += 1

        idx = tags_idx[-1] + 1  # Skip the tags of the removed words

        # Discard further matches that overlap the words just handled
        while tags_to_change_list and tags_idx[-1] >= tags_to_change_list[0]['tags-idx'][0]:
            tags_to_change_list.pop(0)

    new_tags.extend(tags_split[idx:])  # Copy remaining tags

    if len(new_tags) != len(new_sentence.split()):
        # Sentence and tags went out of sync: give back the input untouched
        return sentence, tags

    return new_sentence, ','.join(new_tags)

In [None]:
def get_verbs_in_sentence(sentence):
  """Return the verbs that `predictor` detects in a sentence.

  Only the first 150 whitespace tokens are analysed, lowercased.
  `predictor` is defined elsewhere in the notebook (presumably an AllenNLP
  semantic-role-labelling predictor - confirm).

  Consecutive predicted verbs are accumulated and emitted as one space-joined
  entry as soon as a verb whose description contains more than one '[' is met.
  Verbs still pending at the end are not emitted.

  Args:
    sentence: Sentence to analyse.

  Returns:
    List of verb strings, in prediction order.
  """
  truncated = ' '.join(sentence.split()[:150])
  prediction = predictor.predict(truncated.lower())

  grouped_verbs = []
  pending = []
  for frame in prediction['verbs']:
    pending.append(frame['verb'])
    if frame['description'].count('[') > 1:
      grouped_verbs.append(' '.join(pending))
      pending = []
  return grouped_verbs

In [None]:
def get_verbs_in_sentences(sentences):
  """Batch version of `get_verbs_in_sentence`.

  Each sentence is truncated to its first 150 whitespace tokens and lowercased
  before being sent to `predictor` (defined elsewhere in the notebook). The
  sentences are lowercased on the batch path too, as the per-sentence fallback
  and `get_verbs_in_sentence` already do: `get_first_index_from` matches the
  returned verbs against lowercased tokens.

  The whole batch is tried first; if that call fails, sentences are predicted
  one by one and a sentence that still fails contributes an empty verb list.

  Args:
    sentences: Iterable of sentences.

  Returns:
    One list of verb strings per input sentence, in the same order.
  """
  prepared = [' '.join(s.split()[:150]).lower() for s in sentences]

  try:
    predictions = predictor.predict_batch_json([{'sentence': s} for s in prepared])
  except Exception:
    # Best-effort fallback: a single bad sentence must not discard the whole batch
    predictions = []
    for s in prepared:
      try:
        predictions.append(predictor.predict(s))
      except Exception:
        predictions.append({'verbs': []})

  total_verbs_list = []
  for prediction in predictions:
    verb = []
    verbs_list = []
    for k in prediction['verbs']:
      verb.append(k['verb'])
      # Verbs are accumulated until one whose description has more than one '[' closes the group
      if k['description'].count('[') > 1:
        verbs_list.append(' '.join(verb))
        verb = []
    total_verbs_list.append(verbs_list)

  return total_verbs_list

In [None]:
"""
Takes the verb, the sentence, and from which idx starting to search
Returns the list of idx of the words composing the verb, if it's a negative form and if there was any error in searching

"""

def get_first_index_from(word, sentence, start_from_idx):
  """Locate a (possibly multi-word) verb inside a sentence.

  Matching is by substring against the lowercased sentence tokens, starting at
  token `start_from_idx`. For a multi-word verb the first word is located first,
  then the remaining words are searched as a contiguous group anywhere after it.

  Args:
    word: Verb to look for; one or more whitespace-separated words.
    sentence: Whitespace-tokenised sentence.
    start_from_idx: Token index the search starts from.

  Returns:
    `(indexes, negative_form, error)`: the token indexes composing the verb,
    1 if the matched token contains the verb followed by "n't"/"nt" (else 0),
    and 0 as error flag. When nothing is found the result is `([], -1, 1)`.
  """
  tokens = [token.lower() for token in sentence.split()]
  n_tokens = len(tokens)
  verb_parts = word.split()

  if len(verb_parts) == 1:  # Verb with single word
    for position in range(start_from_idx, n_tokens):
      token = tokens[position]
      if word in token:
        negated = 1 if (word + "n't" in token or word + 'nt' in token) else 0
        return [position], negated, 0
    return [], -1, 1

  # Verb with more words
  head = verb_parts[0]
  tail = ' '.join(verb_parts[1:])
  tail_len = len(verb_parts) - 1
  negated = 0

  for position in range(start_from_idx, n_tokens):
    token = tokens[position]
    if head not in token:
      continue
    if head + "n't" in token or head + 'nt' in token:
      negated = 1
    # The remaining words may be detached from the first one: scan every later window
    for tail_start in range(position + 1, n_tokens):
      tail_end = tail_start + tail_len
      if tail in ' '.join(tokens[tail_start:tail_end]):
        return [position] + list(range(tail_start, tail_end)), negated, 0

  return [], -1, 1

In [None]:
def standardize_verb(verb):
  """Map a verb to the first entry of the group of `verbs_lists_homogenization` containing it.

  Args:
    verb: Verb to standardise.

  Returns:
    The first element of the first group that contains `verb`, or `verb`
    itself when no group contains it.
  """
  return next((group[0] for group in verbs_lists_homogenization if verb in group), verb)

In [None]:
"""
Substitute the verb with its lemmatize form, taking the dictionary with the information about the verb and the sentence
Returns the new sentence and the index of the last modification

"""

def apply_lemmatization(substitutions, sentence):
  """Replace a verb of the sentence with its standardised lemma.

  Every token listed in `substitutions['tags-idx']` is removed, except the last
  one, which is replaced by `standardize_verb(substitutions['lemm'])`. When
  `substitutions['negative-form']` is truthy the word 'not' is placed right
  before the lemma.

  Args:
    substitutions: Dictionary describing the verb ('lemm', 'tags-idx',
      'negative-form' are read here).
    sentence: Whitespace-tokenised sentence.

  Returns:
    `(new_sentence, lemma_index)` where `lemma_index` is the position of the
    lemma inside the new sentence.
  """
  lemma = standardize_verb(substitutions['lemm'])
  verb_positions = substitutions['tags-idx']
  is_negated = substitutions['negative-form']
  final_position = verb_positions[-1]

  rebuilt = []
  lemma_index = 0
  for position, token in enumerate(sentence.split()):
    if position not in verb_positions:  # Not part of the verb: keep the word
      rebuilt.append(token)
      continue
    if position != final_position:  # Leading part of the verb: drop it
      continue
    if is_negated:  # Negative form: a 'not' goes right before the lemma
      rebuilt.append('not')
    rebuilt.append(lemma)
    lemma_index = len(rebuilt) - 1

  return ' '.join(rebuilt), lemma_index

In [None]:
def adjust_tags(substitution, tags):
  """Mirror on the tags the edit that `apply_lemmatization` makes on the sentence.

  The tags at `substitution['tags-idx']` are collapsed into the single tag that
  `get_major_tag` picks among them; when `substitution['negative-form']` is
  truthy an 'O' tag is placed right before it (for the inserted 'not').

  Args:
    substitution: Dictionary describing the verb ('tags-idx' and
      'negative-form' are read here).
    tags: Comma-separated tags, one per sentence token.

  Returns:
    The updated comma-separated tags.
  """
  verb_positions = substitution['tags-idx']
  is_negated = substitution['negative-form']

  original_tags = tags.split(',')
  merged_tag = get_major_tag([original_tags[position] for position in verb_positions])
  final_position = verb_positions[-1]

  rebuilt = []
  for position, tag in enumerate(original_tags):
    if position not in verb_positions:  # Not part of the verb: keep the tag
      rebuilt.append(tag)
    elif position == final_position:  # Last part of the verb: emit the merged tag
      if is_negated:
        rebuilt.append('O')
      rebuilt.append(merged_tag)

  return ','.join(rebuilt)

In [None]:
def homogenization_verbs(sentence, tags='', update_tags=False):
  """Replace every verb detected in the sentence with its standardised lemma.

  Verbs come from `get_verbs_in_sentence`; each one is located with
  `get_first_index_from` (searching after the previous replacement) and
  rewritten by `apply_lemmatization`. Verbs that cannot be located are skipped.

  Args:
    sentence: Whitespace-tokenised sentence.
    tags: Comma-separated tags, one per sentence token. Only read when
      `update_tags` is True.
    update_tags: When True the tags are updated through `adjust_tags`.

  Returns:
    The new sentence, or `(new_sentence, new_tags)` when `update_tags` is True.
    If sentence and tags end up with different lengths the untouched
    `(sentence, tags)` pair is returned instead.
  """
  current_sentence = sentence
  current_tags = tags
  search_from = 0

  for verb in get_verbs_in_sentence(current_sentence):
    lemma = WordNetLemmatizer().lemmatize(verb.split()[-1].lower(), 'v')

    positions, negated, not_found = get_first_index_from(verb, current_sentence, search_from)
    if not_found:
      continue

    replacement = {
        'verb': verb,
        'lemm': lemma,
        'tags-idx': positions,
        'tag-len': len(verb.split()),
        'negative-form': negated
    }

    current_sentence, last_changed = apply_lemmatization(replacement, current_sentence)
    search_from = last_changed + 1  # Next verb is searched after the word just written

    if update_tags:
      current_tags = adjust_tags(replacement, current_tags)

  if not update_tags:
    return current_sentence

  if len(current_sentence.split()) != len(current_tags.split(',')):
    # Sentence and tags went out of sync: give back the input untouched
    return sentence, tags

  return current_sentence, current_tags

In [None]:
def _lemmatize_verbs_in_sentence(sentence, verbs, tags=None):
  """Lemmatize each verb of `verbs` inside `sentence`; keep `tags` aligned when given.

  Returns `(new_sentence, new_tags)`; `new_tags` is None when `tags` is None.
  """
  lemmatizer = WordNetLemmatizer()
  new_sentence = sentence
  new_tags = tags

  start_idx = -1
  for verb in verbs:
    lemm = lemmatizer.lemmatize(verb.split()[-1].lower(), 'v')

    tags_idx, negative_form, error = get_first_index_from(verb, new_sentence, start_idx+1)
    if error:  # Verb not found in the sentence: skip it
      continue

    substitution = {
        'verb': verb,
        'lemm': lemm,
        'tags-idx': tags_idx,
        'tag-len': len(verb.split()),
        'negative-form': negative_form
    }

    # start_idx becomes the position of the lemma, so the next verb is searched after it
    new_sentence, start_idx = apply_lemmatization(substitution, new_sentence)

    if tags is not None:
      new_tags = adjust_tags(substitution, new_tags)

  return new_sentence, new_tags


def homogenization_verbs_post(sentences, tags_l='', update_tags=False):
  """Batch version of `homogenization_verbs`.

  Verbs are predicted for all sentences at once through `get_verbs_in_sentences`,
  then each sentence is lemmatized independently.

  Args:
    sentences: Sentences to process.
    tags_l: One comma-separated tag string per sentence. Only read when
      `update_tags` is True.
    update_tags: When True the tags are updated together with the sentences.

  Returns:
    The list of new sentences, or `(new_sentences, new_tags)` when `update_tags`
    is True. A sentence whose tags end up with a different length is returned
    untouched together with its original tags.
  """
  sentences = list(sentences)
  verbs_list_in_sentences = get_verbs_in_sentences(sentences)

  if not update_tags:
    return [
        _lemmatize_verbs_in_sentence(sentence, verbs)[0]
        for sentence, verbs in zip(sentences, verbs_list_in_sentences)
    ]

  total_sentences = []
  total_tags = []
  for sentence, verbs, tags in zip(sentences, verbs_list_in_sentences, tags_l):
    new_sentence, new_tags = _lemmatize_verbs_in_sentence(sentence, verbs, tags)

    if len(new_sentence.split()) != len(new_tags.split(',')):
      # Sentence and tags went out of sync: keep the input pair
      new_sentence, new_tags = sentence, tags

    total_sentences.append(new_sentence)
    total_tags.append(new_tags)

  return total_sentences, total_tags

In [None]:
def homogenization_steps(sentence, tags='', update_tags=False):
  """Run every homogenization step, in order, on a single sentence.

  Args:
    sentence: Sentence to process.
    tags: Comma-separated tags of the sentence. Only used when `update_tags`
      is True.
    update_tags: When True each step also updates the tags.

  Returns:
    The processed sentence, or `(sentence, tags)` when `update_tags` is True.
  """
  pipeline = (
      homogenization_noun,
      homogenization_try_going_to,
      homogenization_is_capable_of,
      homogenization_modification,
      homogenization_the_following,
      homogenization_verbs,
  )

  if not update_tags:
    for step in pipeline:
      sentence = step(sentence)
    return sentence

  for step in pipeline:
    sentence, tags = step(sentence, tags=tags, update_tags=True)
  return sentence, tags

In [None]:
def homogenization_steps_post(sentences, tags_l='', update_tags=False):
  """Run every homogenization step on a list of sentences.

  All the steps but the verb one are applied sentence by sentence; the verb
  step is then delegated once, for the whole list, to `homogenization_verbs_post`.

  Args:
    sentences: Sentences to process.
    tags_l: One comma-separated tag string per sentence. Only used when
      `update_tags` is True.
    update_tags: When True each step also updates the tags.

  Returns:
    Whatever `homogenization_verbs_post` returns: the processed sentences, or
    `(sentences, tags)` when `update_tags` is True.
  """
  per_sentence_steps = (
      homogenization_noun,
      homogenization_try_going_to,
      homogenization_is_capable_of,
      homogenization_modification,
      homogenization_the_following,
  )

  if not update_tags:
    processed = []
    for sentence in sentences:
      for step in per_sentence_steps:
        sentence = step(sentence)
      processed.append(sentence)
    return homogenization_verbs_post(processed)

  processed = []
  processed_tags = []
  for sentence, tags in zip(sentences, tags_l):
    for step in per_sentence_steps:
      sentence, tags = step(sentence, tags=tags, update_tags=True)
    processed.append(sentence)
    processed_tags.append(tags)

  return homogenization_verbs_post(processed, tags_l=processed_tags, update_tags=True)

#### <font color='yellow'>Name Resolution</font>

##### Pronouns and Subject ellipsis resolution

In [None]:
def remove_punctuation_from_words(sentence):
  """Lowercase a sentence and strip ASCII punctuation from each of its words.

  A word made only of punctuation is kept as it is, so that the number of
  tokens never changes.

  Args:
    sentence: Sentence to normalise.

  Returns:
    List of normalised tokens, one per whitespace-separated word.
  """
  drop_punctuation = str.maketrans('', '', string.punctuation)

  cleaned_tokens = []
  for token in sentence.lower().split():
    stripped = token.translate(drop_punctuation)
    cleaned_tokens.append(stripped if stripped != '' else token)
  return cleaned_tokens

def ellipsis_pronouns(sentence, tags='', update_tags=False):
    """Replace coreferring mentions with the text resolved by the `ellipsis_nlp` pipeline.

    The sentence is run through `ellipsis_nlp` and `doc._.coref_resolved` (newlines
    removed) is taken as the new sentence. When `update_tags` is True the tags are
    realigned: the original and resolved sentences are compared token by token
    (lowercased, punctuation stripped) and every replaced token receives the tags
    that the replacing words carry at an earlier position of the original sentence.

    Args:
      sentence: Whitespace-tokenised sentence.
      tags: Comma-separated tags, one per sentence token. Only read when
        `update_tags` is True.
      update_tags: When True the tags are realigned to the resolved sentence.

    Returns:
      - no coreference detected: `sentence`, or `(sentence, tags)` with `update_tags`;
      - coreference detected, `update_tags` False: the resolved sentence;
      - coreference detected, `update_tags` True: `(new_sentence, new_tags)`, or the
        untouched `(sentence, tags)` when the realignment fails or the lengths differ.
    """
    doc = ellipsis_nlp(sentence)
    new_sentence = doc._.coref_resolved.replace("\n", "")

    if doc._.has_coref:
      # First mention of every cluster, as a list of normalised tokens
      list_mentions = []
      for cluster in doc._.coref_clusters:
          list_mentions.append(remove_punctuation_from_words(str(cluster.mentions[0])))  # Original noun

      if update_tags:
        sentence_split = remove_punctuation_from_words(sentence)
        new_sentence_split = remove_punctuation_from_words(new_sentence)
        tags_split = tags.split(',')

        idx_sentence = 0
        idx_new_sentence = 0
        len_sentence = len(sentence_split)
        len_new_sentence = len(new_sentence_split)

        changed_words = []     # Words of the new sentence that replace the current original token
        changed_idx = []       # Index (in the original sentence) of the replaced token
        changes_to_check = []  # One dictionary per detected replacement

        # Step 1: walk both token lists in parallel and collect the replacements
        while idx_sentence < len_sentence and idx_new_sentence < len_new_sentence:
          if sentence_split[idx_sentence] != new_sentence_split[idx_new_sentence]:
            # The original token was replaced: record it and move past it
            changed_words.append(new_sentence_split[idx_new_sentence])
            changed_idx.append(idx_sentence)
            idx_sentence += 1
            idx_new_sentence += 1
            temp_idx_new_sentence = idx_new_sentence

            if idx_sentence < len_sentence:
              # Consume new-sentence words until the next original token shows up again
              while idx_new_sentence < len_new_sentence:
                if sentence_split[idx_sentence] != new_sentence_split[idx_new_sentence]:
                  changed_words.append(new_sentence_split[idx_new_sentence])
                  idx_new_sentence += 1
                else:
                  changes_to_check.append({
                      'changed_words_sentence': changed_words,
                      'changed_idx_sentence': changed_idx,
                      'final_tags': []
                  })
                  changed_words = []
                  changed_idx = []
                  break

              #### new block for special cases
              # The next original token was never found again in the new sentence
              if idx_new_sentence >= len_new_sentence: #problem w\ 2 pronouns one after the other
                # Use the known first mentions to guess how many words replaced the first token;
                # if none matches, len_el keeps the length of the last mention examined
                for el in list_mentions:
                  len_el = len(el)
                  if changed_words[:len_el] == el:
                    break

                changes_to_check.append({
                      'changed_words_sentence': changed_words[:len_el],
                      'changed_idx_sentence': changed_idx,
                      'final_tags': []
                  })
                # Rewind the new-sentence cursor to just after the first replacement
                idx_new_sentence = temp_idx_new_sentence + len_el-1

                if idx_new_sentence >= len_new_sentence:
                  #print('ERROR IN SUBSTITUTIONS FOR SENTENCE', sentence)
                  return sentence, tags

                # Start recording the second replacement
                changed_words = []
                changed_words.append(new_sentence_split[idx_new_sentence])
                changed_idx = []
                changed_idx.append(idx_sentence)
                idx_sentence += 1
                idx_new_sentence += 1


                if idx_sentence < len_sentence:
                  while idx_new_sentence < len_new_sentence:
                    if sentence_split[idx_sentence] != new_sentence_split[idx_new_sentence]:
                      changed_words.append(new_sentence_split[idx_new_sentence])
                      idx_new_sentence += 1
                    else:
                      changes_to_check.append({
                          'changed_words_sentence': changed_words,
                          'changed_idx_sentence': changed_idx,
                          'final_tags': []
                      })
                      changed_words = []
                      changed_idx = []
                      break

                else:
                  # The second replaced token was the last of the original sentence
                  while idx_new_sentence < len_new_sentence:
                      changed_words.append(new_sentence_split[idx_new_sentence])
                      idx_new_sentence += 1
                  changes_to_check.append({
                      'changed_words_sentence': changed_words,
                      'changed_idx_sentence': changed_idx,
                      'final_tags': []
                  })
                  changed_words = []
                  changed_idx = []
              ####

            else:
              # The replaced token was the last of the original sentence: the rest of the new sentence replaces it
              while idx_new_sentence < len_new_sentence:
                  changed_words.append(new_sentence_split[idx_new_sentence])
                  idx_new_sentence += 1
              changes_to_check.append({
                  'changed_words_sentence': changed_words,
                  'changed_idx_sentence': changed_idx,
                  'final_tags': []
              })
              changed_words = []
              changed_idx = []

          idx_sentence += 1
          idx_new_sentence += 1

        # Step 2: for every replacement, search backwards in the original sentence for the
        # replacing words and copy their tags; 'final_tags' stays empty when they are not found
        idx_sentence = 0
        idx_changes = 0
        len_changes_to_check = len(changes_to_check)
        while idx_changes < len_changes_to_check:
          current_change = changes_to_check[idx_changes]
          len_changed_words = len(current_change['changed_words_sentence'])
          idx_sentence = current_change['changed_idx_sentence'][0]-len_changed_words
          while idx_sentence >= 0:
            if sentence_split[idx_sentence:idx_sentence+len_changed_words] == current_change['changed_words_sentence']:
              current_change['final_tags'] = tags_split[idx_sentence:idx_sentence+len_changed_words]
              idx_changes += 1
              break
            idx_sentence -= 1
          if idx_sentence <= -1:
            idx_changes += 1

        """for el in changes_to_check:
          print(el)"""

        # Step 3: rebuild the tags, swapping the tag of each replaced token for the copied ones
        new_tags = []
        idx = 0
        while len(changes_to_check) > 0:  # Update tags
          changed_idx_sentence = changes_to_check[0]['changed_idx_sentence']
          final_tags = changes_to_check[0]['final_tags']

          while idx not in changed_idx_sentence:
            new_tags.append(tags_split[idx])
            idx+=1

          new_tags.extend(final_tags)
          idx = changed_idx_sentence[-1] + 1

          changes_to_check.pop(0)

        while idx < len_sentence:  # Copy remaining tags
          new_tags.append(tags_split[idx])
          idx += 1


        if len(new_tags) != len(new_sentence.split()):
          # Sentence and tags went out of sync: give back the input untouched
          #print("\nERROR DIFFERENT LENGHT AT PRONOUNS STEP --> sentence_len:", len(new_sentence.split()), "!= tags_len:", len(new_tags), "w\\\n", new_sentence.split(), 'and\n', new_tags)
          return sentence, tags


        new_tags = ','.join(new_tags)
        return new_sentence, new_tags

      return new_sentence

    else:
      # No coreference detected: nothing to resolve
      if update_tags:
        return sentence, tags
      return sentence

In [None]:
"""
Search subject of sentence starting from the last one
"""
def detect_subj(sentence_list):
    """Find the most recent grammatical subject in a list of sentences.

    Sentences are parsed with `ellipsis_nlp` from the last one backwards; in the
    first sentence holding a token whose dependency label is "nsubj", the text of
    the last such token is the subject.

    Args:
        sentence_list: Sentences to inspect, in document order.

    Returns:
        `(subject, sentence_index)`, or `('It', -1)` when no subject is detected.
    """
    for idx_sent in range(len(sentence_list) - 1, -1, -1):
        nsubj_texts = [token.text for token in ellipsis_nlp(sentence_list[idx_sent]) if token.dep_ == "nsubj"]
        if nsubj_texts and nsubj_texts[-1]:
            return nsubj_texts[-1], idx_sent

    return 'It', -1  # If no subject is detected

def capitalize(line):
    """Uppercase the first character of `line`, leaving the rest untouched.

    Unlike ``str.capitalize`` the remaining characters are not lowercased.
    An empty string is returned as it is (indexing its first character used to
    raise IndexError).
    """
    return line[:1].upper() + line[1:]

"""
Add subject to sentences that starts with a verb
It can be found in previous sentences or simply add a 'It'
"""
def ellipsis_subject(sentence_list, search_subj_in_previous_text=False, tags_list=[], update_tags=False):
  """Prepend an explicit subject to every sentence that starts with a verb.

  A sentence is considered subject-less when its first whitespace token is tagged
  VB/VBZ by ``nltk.pos_tag``, when ``ellipsis_nlp`` marks its first token as VERB,
  or when that token (lower-cased) is listed in ``ellipsis_verbs``. Every other
  sentence is passed through (stripped of surrounding whitespace).

  The former "Subcase 2.1" and "Case 3" branches required the first token to be in
  ``ellipsis_verbs``, which the first branch already catches, so they could never
  run; the remaining cases all left the sentence unchanged and are merged here.

  Args:
    sentence_list: Sentences of one document, in reading order.
    search_subj_in_previous_text: When True the subject is looked up with
      ``detect_subj`` in the sentences already processed; otherwise 'It' is used.
    tags_list: Comma-separated tag strings aligned with `sentence_list`
      (one tag per whitespace token). Only read when `update_tags` is True.
      The default list is never mutated.
    update_tags: When True the tags are updated together with the sentences.

  Returns:
    ``new_sentence_list`` or, when `update_tags` is True,
    ``(new_sentence_list, new_tags_list)``. If the rewritten sentence and its tags
    end up with different lengths, the original sentence and tags are kept.
  """
  new_sentence_list = []
  new_tags_list = []

  for idx, sentence in enumerate(sentence_list):  # For each sentence
    # Empty or whitespace-only sentences carry no token to inspect
    if not sentence.strip():
      new_sentence_list.append('')
      if update_tags:
        new_tags_list.append('')
      continue

    token = sentence.split()
    doc = ellipsis_nlp(sentence)
    first_word, first_pos = nltk.pos_tag(token)[0]  # Tag once instead of at every use

    starts_with_verb = (
        first_pos in ("VB", "VBZ")
        or doc[0].pos_ == "VERB"
        or doc[0].text.lower() in ellipsis_verbs
    )

    if starts_with_verb:
      if search_subj_in_previous_text:  # Search subject in previous sentences
        subject, idx_sent = detect_subj(new_sentence_list)
      else:  # Do not search in previous sentences - simply add 'It'
        subject, idx_sent = 'It', -1

      # Rebuild from the whitespace tokens so that leading blanks or tabs cannot
      # duplicate or drop words (tags are aligned with these same tokens)
      new_sentence = " ".join([capitalize(subject), first_word.lower()] + token[1:])

      if update_tags:
        subject_tag = _ellipsis_subject_tag(subject, idx_sent, new_sentence_list, new_tags_list)
        new_tags = subject_tag + ',' + tags_list[idx]
    else:
      new_sentence = sentence
      if update_tags:
        new_tags = tags_list[idx]

    if update_tags and len(new_sentence.strip().split()) != len(new_tags.split(',')):
      # Sentence and tags went out of sync: keep the untouched originals
      new_sentence_list.append(sentence_list[idx])
      new_tags_list.append(tags_list[idx])
      continue

    new_sentence_list.append(new_sentence.strip())
    if update_tags:
      new_tags_list.append(new_tags)

  if update_tags:
    return new_sentence_list, new_tags_list

  return new_sentence_list


def _ellipsis_subject_tag(subject, idx_sent, previous_sentences, previous_tags):
  """Return the tag that `subject` carries in previous sentence `idx_sent`, or 'O'.

  'O' is returned when no subject sentence exists (`idx_sent` == -1) and also when
  the subject text cannot be located among the whitespace tokens of that sentence;
  in the latter case the tag variable used to stay unbound or keep the value of an
  earlier sentence.
  """
  if idx_sent == -1:
    return 'O'

  tags_split = previous_tags[idx_sent].split(',')
  sentence_split = previous_sentences[idx_sent].split()
  pattern = re.compile(rf'\b(?:{re.escape(subject)})\b', re.IGNORECASE)

  # Scan from the end so the last occurrence of the subject word wins
  for tok, tag in zip(reversed(sentence_split), reversed(tags_split)):
    if pattern.search(tok):
      return tag

  return 'O'

##### Stopwords Removal

In [None]:
"""
  tags='' and update_tags=False are optional, since the function can also be called during the prediction phase.
  During the training phase, pass both arguments so that the tags associated with the sentence are updated too.

"""
def remove_stopwords(sentence, tags='', update_tags=False):
  """Drop every whitespace token of `sentence` whose lower-case form is in `stop_words`.

  Args:
    sentence: Text to filter.
    tags: Comma-separated tags, one per whitespace token of `sentence`.
      Only used when `update_tags` is True.
    update_tags: True during the training phase, when tags must follow the tokens.

  Returns:
    The filtered sentence, or ``(filtered_sentence, filtered_tags)`` when
    `update_tags` is True. If tokens and tags are not aligned one-to-one the
    original ``(sentence, tags)`` pair is returned untouched. The old guard
    compared two lists that are always appended to together, so it never fired
    and ``zip`` silently truncated the longer side.
  """
  sentence_tokenized = sentence.split()

  # Prediction-phase case
  if not update_tags:
    return ' '.join(word for word in sentence_tokenized if word.lower() not in stop_words)

  # Train-phase case
  if not sentence_tokenized:  # Nothing to filter, whatever the tags are
    return '', ''

  tags_tokenized = tags.split(',')
  if len(sentence_tokenized) != len(tags_tokenized):
    return sentence, tags

  kept_pairs = [
      (word, tag)
      for word, tag in zip(sentence_tokenized, tags_tokenized)
      if word.lower() not in stop_words
  ]
  filtered_sentence = ' '.join(word for word, _ in kept_pairs)
  filtered_tags = ','.join(tag for _, tag in kept_pairs)
  return filtered_sentence, filtered_tags

##### Internet Slang Removal

In [None]:
"""
  tags='' and update_tags=False are optional, since the function can also be called during the prediction phase.
  During the training phase, pass both arguments so that the tags associated with the sentence are updated too.

"""
def remove_internet_slangs(sentence, tags='', update_tags=False):
  """Drop every whitespace token of `sentence` whose lower-case form is in `internet_slang_words`.

  Args:
    sentence: Text to filter.
    tags: Comma-separated tags, one per whitespace token of `sentence`.
      Only used when `update_tags` is True.
    update_tags: True during the training phase, when tags must follow the tokens.

  Returns:
    The filtered sentence, or ``(filtered_sentence, filtered_tags)`` when
    `update_tags` is True. If tokens and tags are not aligned one-to-one the
    original ``(sentence, tags)`` pair is returned untouched. The old guard
    compared two lists that are always appended to together, so it never fired
    and ``zip`` silently truncated the longer side.
  """
  sentence_tokenized = sentence.split()

  # Prediction-phase case
  if not update_tags:
    return ' '.join(word for word in sentence_tokenized if word.lower() not in internet_slang_words)

  # Train-phase case
  if not sentence_tokenized:  # Nothing to filter, whatever the tags are
    return '', ''

  tags_tokenized = tags.split(',')
  if len(sentence_tokenized) != len(tags_tokenized):
    return sentence, tags

  kept_pairs = [
      (word, tag)
      for word, tag in zip(sentence_tokenized, tags_tokenized)
      if word.lower() not in internet_slang_words
  ]
  filtered_sentence = ' '.join(word for word, _ in kept_pairs)
  filtered_tags = ','.join(tag for _, tag in kept_pairs)
  return filtered_sentence, filtered_tags

##### Aliases Handling

In [None]:
def get_actor_groups_regex_lists():
  """Compile one case-insensitive regex per alias group in `actor_groups_aliases`.

  Each regex is a word-boundary-delimited alternation of the (escaped) aliases of
  its group; the returned list keeps the order of `actor_groups_aliases`.
  """
  return [
      re.compile(
          r'\b(?:' + '|'.join(re.escape(alias) for alias in alias_list) + r')\b',
          re.IGNORECASE,
      )
      for alias_list in actor_groups_aliases
  ]

# Compiled once at cell execution and reused for every sentence
actor_groups_regex_lists = get_actor_groups_regex_lists()


def get_malware_families_regex_lists():
  """Compile one case-insensitive regex per alias group in `malware_families_aliases`.

  Each regex is a word-boundary-delimited alternation of the (escaped) aliases of
  its group; the returned list keeps the order of `malware_families_aliases`.
  """
  return [
      re.compile(
          r'\b(?:' + '|'.join(re.escape(alias) for alias in alias_list) + r')\b',
          re.IGNORECASE,
      )
      for alias_list in malware_families_aliases
  ]

# Compiled once at cell execution and reused for every sentence
malware_families_regex_lists = get_malware_families_regex_lists()

In [None]:
"""
  tags='' and update_tags=False are optional, since the function can also be called during the prediction phase.
  During the training phase, pass both arguments so that the tags associated with the sentence are updated too.

"""
def handle_actor_groups_aliases(sentence, tags='', update_tags=False):
  """Replace every known threat-actor alias in `sentence` with its canonical name.

  For each alias group of `actor_groups_aliases` (paired with its compiled regex
  from `actor_groups_regex_lists`) all matches are replaced by the first entry of
  the group. The whitespace-token spans of the matches in the ORIGINAL sentence
  are recorded so that tags can be rewritten to B-APT / I-APT.

  Changes with respect to the previous implementation:
    * matches are visited in order of appearance instead of ``set`` order, and at
      the same start position the longest span wins, so the outcome no longer
      depends on hash ordering;
    * a span is recorded once per alias group even if the alias occurs in several
      letter cases, and 'original_name' is the text actually found in the sentence;
    * the canonical name is inserted literally (no backslash/group processing);
    * misaligned tags end in the regular error path instead of an IndexError.

  Args:
    sentence: Text to normalize.
    tags: Comma-separated tags, one per whitespace token. Only used when
      `update_tags` is True.
    update_tags: True during the training phase.

  Returns:
    ``(new_sentence, alias_pair_list)`` or, when `update_tags` is True,
    ``(new_sentence, new_tags, alias_pair_list)``. Each alias pair is a dict with
    keys 'alias_name' (canonical name) and 'original_name' (matched text).
    When tags cannot be kept aligned, ``(sentence, tags, [])`` is returned.
  """
  if sentence == '':
    if update_tags:
      return sentence, tags, []
    return sentence, []

  new_sentence = sentence

  sentence_split = sentence.split()
  len_sentence_split = len(sentence_split)

  tags_split = tags.split(',')

  tags_to_change_list = []

  for var, big_regex in zip(actor_groups_aliases, actor_groups_regex_lists):
    canonical_name = var[0]
    # A callable replacement inserts the name verbatim
    new_sentence = big_regex.sub(lambda _match: canonical_name, new_sentence)  # Apply regex

    final_tags = ['B-APT'] + ['I-APT'] * (len(canonical_name.split()) - 1)
    seen_spans = set()  # (start, length) spans already recorded for this group

    # dict.fromkeys removes duplicates while keeping the order of appearance
    for matched_word in dict.fromkeys(big_regex.findall(sentence)):
      len_matched_word = len(matched_word.split())  # Count number of words in the keyword
      pattern = re.compile(r'\b{}\b'.format(re.escape(matched_word)), re.IGNORECASE)

      # Search the matched keyword in the original sentence and retrieve indexes
      for idx in range(len_sentence_split - len_matched_word + 1):
        if (idx, len_matched_word) in seen_spans:
          continue

        found = pattern.search(' '.join(sentence_split[idx:idx+len_matched_word]))
        if found:
          seen_spans.add((idx, len_matched_word))
          tags_to_change_list.append({  # Save indexes of the keyword to update tags
            'word_to_change': found.group(0),
            'tags-idx': list(range(idx, idx+len_matched_word)),
            'final-tag': list(final_tags),
            'final-word': canonical_name
          })

  # First appearance first; at the same start position the longest span first
  tags_to_change_list.sort(key=lambda x: (x['tags-idx'][0], -len(x['tags-idx'])))

  alias_pair_list = [
      {'alias_name': el['final-word'], 'original_name': el['word_to_change']}
      for el in tags_to_change_list
  ]

  if not update_tags:
    return new_sentence, alias_pair_list

  new_tags = []
  tags_aligned = len(tags_split) == len_sentence_split
  if tags_aligned:
    idx = 0
    last_covered = -1
    for change in tags_to_change_list:  # Update tags
      span = change['tags-idx']
      if span[0] <= last_covered:  # Overlaps a span that was already rewritten: skip
        continue
      new_tags.extend(tags_split[idx:span[0]])
      new_tags.extend(change['final-tag'])
      last_covered = span[-1]
      idx = last_covered + 1
    new_tags.extend(tags_split[idx:])  # Copy remaining tags

  if not tags_aligned or len(new_tags) != len(new_sentence.split()):
    print("\nERROR DIFFERENT LENGHT AT THREAT ACTORS STEP --> sentence_len:", len(new_sentence.split()), "!= tags_len:", len(new_tags), "w\\", new_sentence.split(), 'and', new_tags)
    return sentence, tags, []

  new_tags = ','.join(new_tags)
  return new_sentence, new_tags, alias_pair_list

In [None]:
"""
  tags='' and update_tags=False are optional, since the function can also be called during the prediction phase.
  During the training phase, pass both arguments so that the tags associated with the sentence are updated too.

"""
def handle_malware_families_aliases(sentence, tags='', update_tags=False):
  """Replace every known malware-family alias in `sentence` with its canonical name.

  For each alias group of `malware_families_aliases` (paired with its compiled
  regex from `malware_families_regex_lists`) all matches are replaced by the first
  entry of the group. The whitespace-token spans of the matches in the ORIGINAL
  sentence are recorded so that tags can be rewritten.

  Changes with respect to the previous implementation:
    * matches are visited in order of appearance instead of ``set`` order, and at
      the same start position the longest span wins, so the outcome no longer
      depends on hash ordering;
    * a span is recorded once per alias group even if the alias occurs in several
      letter cases, and 'original_name' is the text actually found in the sentence;
    * the canonical name is inserted literally (no backslash/group processing);
    * misaligned tags end in the regular error path instead of an IndexError.

  Args:
    sentence: Text to normalize.
    tags: Comma-separated tags, one per whitespace token. Only used when
      `update_tags` is True.
    update_tags: True during the training phase.

  Returns:
    ``(new_sentence, alias_pair_list)`` or, when `update_tags` is True,
    ``(new_sentence, new_tags, alias_pair_list)``. Each alias pair is a dict with
    keys 'alias_name' (canonical name) and 'original_name' (matched text).
    When tags cannot be kept aligned, ``(sentence, tags, [])`` is returned.
  """
  if sentence == '':
    if update_tags:
      return sentence, tags, []
    return sentence, []

  new_sentence = sentence

  sentence_split = sentence.split()
  len_sentence_split = len(sentence_split)

  tags_split = tags.split(',')

  tags_to_change_list = []

  for var, big_regex in zip(malware_families_aliases, malware_families_regex_lists):
    canonical_name = var[0]
    # A callable replacement inserts the name verbatim
    new_sentence = big_regex.sub(lambda _match: canonical_name, new_sentence)  # Apply regex

    # NOTE(review): malware families receive the same B-APT / I-APT labels as actor
    # groups; this looks copied from handle_actor_groups_aliases - confirm the
    # intended label for malware entities before changing it.
    final_tags = ['B-APT'] + ['I-APT'] * (len(canonical_name.split()) - 1)
    seen_spans = set()  # (start, length) spans already recorded for this group

    # dict.fromkeys removes duplicates while keeping the order of appearance
    for matched_word in dict.fromkeys(big_regex.findall(sentence)):
      len_matched_word = len(matched_word.split())  # Count number of words in the keyword
      pattern = re.compile(r'\b{}\b'.format(re.escape(matched_word)), re.IGNORECASE)

      # Search the matched keyword in the original sentence and retrieve indexes
      for idx in range(len_sentence_split - len_matched_word + 1):
        if (idx, len_matched_word) in seen_spans:
          continue

        found = pattern.search(' '.join(sentence_split[idx:idx+len_matched_word]))
        if found:
          seen_spans.add((idx, len_matched_word))
          tags_to_change_list.append({  # Save indexes of the keyword to update tags
            'word_to_change': found.group(0),
            'tags-idx': list(range(idx, idx+len_matched_word)),
            'final-tag': list(final_tags),
            'final-word': canonical_name
          })

  # First appearance first; at the same start position the longest span first
  tags_to_change_list.sort(key=lambda x: (x['tags-idx'][0], -len(x['tags-idx'])))

  alias_pair_list = [
      {'alias_name': el['final-word'], 'original_name': el['word_to_change']}
      for el in tags_to_change_list
  ]

  if not update_tags:
    return new_sentence, alias_pair_list

  new_tags = []
  tags_aligned = len(tags_split) == len_sentence_split
  if tags_aligned:
    idx = 0
    last_covered = -1
    for change in tags_to_change_list:  # Update tags
      span = change['tags-idx']
      if span[0] <= last_covered:  # Overlaps a span that was already rewritten: skip
        continue
      new_tags.extend(tags_split[idx:span[0]])
      new_tags.extend(change['final-tag'])
      last_covered = span[-1]
      idx = last_covered + 1
    new_tags.extend(tags_split[idx:])  # Copy remaining tags

  if not tags_aligned or len(new_tags) != len(new_sentence.split()):
    print("\nERROR DIFFERENT LENGHT AT THREAT NAMES STEP --> sentence_len:", len(new_sentence.split()), "!= tags_len:", len(new_tags), "w\\", new_sentence.split(), 'and', new_tags)
    return sentence, tags, []

  new_tags = ','.join(new_tags)
  return new_sentence, new_tags, alias_pair_list

In [None]:
def handle_aliases(sentence, tags='', update_tags=False):
  """Normalize actor-group aliases first, then malware-family aliases.

  Returns:
    ``(sentence, actor_aliases, malware_aliases)``, or
    ``(sentence, tags, actor_aliases, malware_aliases)`` when `update_tags` is True.
  """
  if not update_tags:
    # Prediction phase: only the text is rewritten
    sentence, actor_aliases = handle_actor_groups_aliases(sentence)
    sentence, malware_aliases = handle_malware_families_aliases(sentence)
    return sentence, actor_aliases, malware_aliases

  # Training phase: the tags follow the text through both passes
  sentence, tags, actor_aliases = handle_actor_groups_aliases(sentence, tags=tags, update_tags=True)
  sentence, tags, malware_aliases = handle_malware_families_aliases(sentence, tags=tags, update_tags=True)
  return sentence, tags, actor_aliases, malware_aliases

## <font color='orange'>Pipeline NLP</font>

### <font color='yellow'>Sanitization</font>

#### Unrelated content removal

In [None]:
def call_unrelatedContentRemoval(data, final_filename):
  """Pipeline step: apply `analyze_sentence` to every sentence, with an on-disk cache.

  When the global flag `unrelated_content_removal` is off, a notice is printed and
  the inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not unrelated_content_removal:
    print('Step not executed')
    return data, final_filename

  final_filename.append('unrelatedContentRemoval')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Unrelated content removal: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Unrelated Content Removal'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = analyze_sentence(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

#### IOC Defanging

In [None]:
def call_iocDefanging(data, final_filename):
  """Pipeline step: apply `defang_iocs_in_text` to every sentence, with an on-disk cache.

  When the global flag `ioc_defanging` is off, a notice is printed and the inputs
  are returned untouched. Otherwise the step name is appended to `final_filename`
  (in place) and the file ``temp_<joined names>.json`` under `temp_file_path` is
  loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not ioc_defanging:
    print('Step not executed')
    return data, final_filename

  final_filename.append('iocDefanging')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('IOC Defanging: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='IOC Defanging'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = defang_iocs_in_text(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

#### Misspelling Correction

In [None]:
def call_misspellingCorrection(data, final_filename):
  """Pipeline step: apply `spell_check` to every sentence, with an on-disk cache.

  When the global flag `misspelling_correction` is off, a notice is printed and the
  inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not misspelling_correction:
    print('Step not executed')
    return data, final_filename

  final_filename.append('misspellingCorrection')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Misspelling correction: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Misspelling correction'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = spell_check(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

### <font color='yellow'>Text Normalization</font>

#### Passive\Active Conversion

In [None]:
def call_passiveActiveConversion(data, final_filename):
  """Pipeline step: apply `convert_active_to_passive` to every sentence, with an on-disk cache.

  When the global flag `passive_active_conversion` is off, a notice is printed and
  the inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not passive_active_conversion:
    print('Step not executed')
    return data, final_filename

  final_filename.append('passiveActiveConversion')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Passive\\Active Conversion: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Passive\\Active Conversion'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = convert_active_to_passive(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

#### Synonym Homogenization

In [None]:
def call_synonymHomogenization(data, final_filename):
  """Pipeline step: apply `homogenization_steps_post` to each entry's content, with an on-disk cache.

  When the global flag `synonym_homogenization` is off, a notice is printed and the
  inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` is loaded
  if it exists, or else computed and written.

  The cache now lives under `temp_file_path` like every other step; it used to be
  written to a separate hard-coded '.../Results/temp5' directory.

  Returns:
    ``(data, final_filename)``
  """
  if synonym_homogenization:
    final_filename.append('synonymHomogenization')

    temp_filename = 'temp_' + '_'.join(final_filename) + '.json'
    file_path = os.path.join(temp_file_path, temp_filename)

    if os.path.exists(file_path):
      with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
      print('Synonym Homogenization: Step already completed --> Load data')
    else:
      for entry in tqdm(data, desc='Synonym Homogenization'):
        # The whole sentence list of the entry is processed in one call
        entry['content'] = homogenization_steps_post(entry['content'])

      with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

      print(f'\nSaved in {file_path}')
  else:
    print('Step not executed')
  return data, final_filename

### <font color='yellow'>Name Resolution</font>

#### Pronouns and Subject Ellipsis Resolution

In [None]:
def call_pronounsSubjectEllipsisResolution(data, final_filename, skip_entry_ids=(18370,)):
  """Pipeline step: resolve pronouns and missing subjects, with an on-disk cache.

  When the global flag `pronouns_subject_ellipsis_resolution` is off, a notice is
  printed and the inputs are returned untouched. Otherwise the step name is
  appended to `final_filename` (in place) and the file
  ``temp_<joined names>.json`` under `temp_file_path` is loaded if it exists, or
  else computed and written. Computing means: `ellipsis_pronouns` on every
  sentence, then `ellipsis_subject` on the whole sentence list of the entry.

  Args:
    data: List of entries; each one has an 'ID' and a 'content' list of sentences.
    final_filename: List of name parts of the steps executed so far.
    skip_entry_ids: IDs of entries that are left unprocessed. Defaults to the
      single ID that was previously hard-coded (the reason for skipping it is not
      documented in this notebook).

  Returns:
    ``(data, final_filename)``
  """
  if pronouns_subject_ellipsis_resolution:
    final_filename.append('pronounsSubjectEllipsisResolution')

    temp_filename = 'temp_' + '_'.join(final_filename) + '.json'
    file_path = os.path.join(temp_file_path, temp_filename)

    if os.path.exists(file_path):
      with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)
      print('Pronouns and Subject Ellipsis Resolution: Step already completed --> Load data')
    else:
      for entry in tqdm(data, desc='Pronouns and Subject Ellipsis Resolution'):
        print(f'Entry ID {entry["ID"]}')
        if entry['ID'] in skip_entry_ids:
          continue

        sentences = entry['content']
        for idx, sentence in enumerate(sentences):
          sentences[idx] = ellipsis_pronouns(sentence)
        # Subject resolution needs the full list, since it may look at previous sentences
        entry['content'] = ellipsis_subject(sentences, search_subj_in_previous_text=search_subj_in_previous_text)

      with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

      print(f'\nSaved in {file_path}')
  else:
    print('Step not executed')
  return data, final_filename

#### Stopwords Removal

In [None]:
def call_stopwordsRemoval(data, final_filename):
  """Pipeline step: apply `remove_stopwords` to every sentence, with an on-disk cache.

  When the global flag `stopwords_removal` is off, a notice is printed and the
  inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not stopwords_removal:
    print('Step not executed')
    return data, final_filename

  final_filename.append('stopwordsRemoval')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Stopwords Removal: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Stopwords Removal'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = remove_stopwords(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

#### Internet Slang Removal

In [None]:
def call_internetSlangRemoval(data, final_filename):
  """Pipeline step: apply `remove_internet_slangs` to every sentence, with an on-disk cache.

  When the global flag `internet_slang_removal` is off, a notice is printed and the
  inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.

  Returns:
    ``(data, final_filename)``
  """
  if not internet_slang_removal:
    print('Step not executed')
    return data, final_filename

  final_filename.append('internetSlangRemoval')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Internet Slang Removal: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Internet Slang Removal'):
    sentences = entry['content']
    for position in range(len(sentences)):
      sentences[position] = remove_internet_slangs(sentences[position])

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

#### Aliases Handling

In [None]:
def call_aliasesHandling(data, final_filename):
  """Pipeline step: apply `handle_aliases` to every sentence, with an on-disk cache.

  When the global flag `aliases_handling` is off, a notice is printed and the
  inputs are returned untouched. Otherwise the step name is appended to
  `final_filename` (in place) and the file ``temp_<joined names>.json`` under
  `temp_file_path` is loaded if it exists, or else computed and written.
  Besides rewriting the sentences, each entry receives an 'aliases_list' with one
  ``{'actor_aliases': ..., 'malware_aliases': ...}`` dict per sentence.

  Returns:
    ``(data, final_filename)``
  """
  if not aliases_handling:
    print('Step not executed')
    return data, final_filename

  final_filename.append('aliasesHandling')
  file_path = os.path.join(temp_file_path, 'temp_' + '_'.join(final_filename) + '.json')

  if os.path.exists(file_path):
    # Cached result of a previous run
    with open(file_path, 'r', encoding='utf-8') as json_file:
      data = json.load(json_file)
    print('Aliases Handling: Step already completed --> Load data')
    return data, final_filename

  for entry in tqdm(data, desc='Aliases Handling'):
    sentences = entry['content']
    per_sentence_aliases = []
    for position in range(len(sentences)):
      sentences[position], actor_aliases, malware_aliases = handle_aliases(sentences[position])
      per_sentence_aliases.append({'actor_aliases':actor_aliases, 'malware_aliases':malware_aliases})
    entry['aliases_list'] = per_sentence_aliases

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'\nSaved in {file_path}')
  return data, final_filename

### <font color='lightblue'>Save result</font>

In [None]:
def call_save(data, final_filename):
  """Write `data` as indented JSON into the results folder.

  The file name is the '_'-joined parts of `final_filename` plus '.json'.

  Returns:
    ``(data, file_name)`` where `file_name` is the joined name as a string
    (no longer the list that was passed in).
  """
  output_name = '_'.join(final_filename) + '.json'
  file_path = os.path.join('/content/drive/My Drive/Steps NLP framework/Pipeline_new_data/Folder_datasets/Results', output_name)

  with open(file_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)

  print(f'Saved in {file_path}')
  return data, output_name

## Loop block

In [None]:
# Chunks to process in this run: `min_len` is the first chunk index and
# `stop_len` the exclusive upper bound (a window of 40 chunks).
min_len = 1
stop_len = 40 + min_len

In [None]:
source = 0  # 0:HF or 1:REP

# Choose steps to perform in the pipeline

# -- Sanitization
unrelated_content_removal = True
ioc_defanging = True
misspelling_correction = False

# -- Text normalization
passive_active_conversion = True
synonym_homogenization = source != 0  # Never run for the HF source

# -- Name resolution
pronouns_subject_ellipsis_resolution = True
if pronouns_subject_ellipsis_resolution:
  # Only consulted by the pronouns/subject step, hence defined only when it runs
  search_subj_in_previous_text = True
stopwords_removal = True
internet_slang_removal = True
aliases_handling = True

In [None]:
# Folder where every pipeline step stores / looks up its temp_*.json cache file
temp_file_path = (
    '/content/drive/My Drive/Steps NLP framework'
    '/Pipeline_new_data/Folder_datasets/Results/temp'
)

In [None]:
# Per-source upper bound (exclusive) of the chunk index.
if source == 0:
  max_len = 9005
elif source == 1:
  max_len = 394
else:
  # Fail loudly: otherwise max_len would be undefined, or silently reuse the value
  # left in the kernel by a previous run of this cell.
  raise ValueError(f'Unknown source {source!r}: expected 0 (HF) or 1 (REP)')

# Never go past the window selected with min_len / stop_len
max_len = min(stop_len, max_len)
print(f'from {min_len} to {max_len-1}')

In [None]:
# Steps are executed in this order; each one receives and returns (data, final_filename)
pipeline_steps = (
    call_unrelatedContentRemoval,
    call_iocDefanging,
    call_misspellingCorrection,
    call_passiveActiveConversion,
    call_synonymHomogenization,
    call_pronounsSubjectEllipsisResolution,
    call_stopwordsRemoval,
    call_internetSlangRemoval,
    call_aliasesHandling,
    call_save,
)

for i_dataset in range(min_len, max_len):
  print(f'{i_dataset}/{max_len-1}')
  final_filename = []  # Grows with the name of every executed step

  # Pick the input chunk of the selected source
  if source == 0:
    path = f'/content/drive/My Drive/Steps NLP framework/Pipeline_new_data/Folder_datasets/HF_dataset_{i_dataset}.json'
    final_filename.append(f'HF_{i_dataset}')
  elif source == 1:
    path = f'/content/drive/My Drive/Steps NLP framework/Pipeline_new_data/Folder_datasets/REP_dataset_{i_dataset}.json'
    final_filename.append(f'REP_{i_dataset}')

  with open(path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

  for pipeline_step in pipeline_steps:
    data, final_filename = pipeline_step(data, final_filename)

  print('\n-----------------------------\n')