In [3]:
!pip install transformers



In [4]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

import nltk
nltk.download('punkt')

import pickle


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Directory that holds the fine-tuned model weights and its vocabulary.
model_dir = 'MODEL'

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Restore the tokenizer and the sequence classifier from the same directory.
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Move the model onto the selected device; being the last expression of the
# cell, this also displays the module structure.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
def remove_consec_newline(s):
    """Collapse every run of consecutive newline characters into one.

    Args:
        s: Text to clean up. An empty string is accepted and returned as is.

    Returns:
        A copy of ``s`` in which each run of newline characters has been
        reduced to a single newline; every other character is kept unchanged.
    """
    kept = []
    for ch in s:
        # Drop a newline that directly follows a newline we already kept.
        if ch == '\n' and kept and kept[-1] == '\n':
            continue
        kept.append(ch)
    # Join once at the end instead of growing a string one character at a time.
    return ''.join(kept)

def ttp_id_name(ttp_list):
    """Look up the name stored for each TTP ID.

    The lookup table is unpickled from the file whose path is held in the
    module-level variable ``ttpid2name``; it is used as a mapping from ID to
    name.

    Args:
        ttp_list: Iterable of TTP IDs to resolve.

    Returns:
        A list with the names of the IDs found in the table, in the order the
        IDs were given. IDs missing from the table are skipped, so the result
        can be shorter than ``ttp_list``.
    """
    with open(ttpid2name, 'rb') as table_file:
        name_by_id = pickle.load(table_file)

    resolved_names = []
    for ttp_id in ttp_list:
        # Unknown IDs contribute nothing to the result.
        if ttp_id in name_by_id:
            resolved_names.append(name_by_id[ttp_id])
    return resolved_names


def map_ttp_id(mapped_labels):
    """Translate predicted integer labels into TTP IDs and their names.

    The label dictionary is unpickled from the file whose path is held in the
    module-level variable ``label2ttpid``. It is inverted here, so it is
    expected to map TTP ID -> integer label. Names are resolved through
    ``ttp_id_name``.

    Args:
        mapped_labels: Iterable of integer class labels; duplicates allowed.

    Returns:
        A tuple ``(ttp_ids, ttp_names)`` of two lists with the same length.
        ``ttp_ids`` holds each distinct TTP ID once, in the order it first
        appears in ``mapped_labels``; ``ttp_names[i]`` is the name for
        ``ttp_ids[i]``, or ``'unknown'`` when no name is available.

    Raises:
        KeyError: If a label is not present in the label dictionary.
    """
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(label2ttpid, 'rb') as f:
        label_dict = pickle.load(f)

    # Invert the dictionary so integer labels can be mapped back to TTP IDs.
    inverted_label_dict = {v: k for k, v in label_dict.items()}

    ttp_list = [inverted_label_dict[label] for label in mapped_labels]

    # De-duplicate while keeping first-seen order. list(set(...)) yields an
    # order that can change from run to run, which made the output unstable.
    ttp_ids = list(dict.fromkeys(ttp_list))

    ttp_names = ttp_id_name(ttp_ids)
    if len(ttp_names) != len(ttp_ids):
        # ttp_id_name skips IDs it does not know, which would shift the names
        # against the IDs. Resolve one ID at a time so both lists stay aligned.
        ttp_names = [
            (ttp_id_name([ttp_id]) or ['unknown'])[0] for ttp_id in ttp_ids
        ]

    return ttp_ids, ttp_names

def extract_ttps(fname, model, tokenizer, threshold=6.15):
    """Classify every sentence of a text report and collect the detected TTPs.

    The file is read as UTF-8, split into sentences with NLTK (and further on
    newlines), and each non-empty piece is classified on its own. A prediction
    is kept only when its highest logit exceeds ``threshold``.

    Args:
        fname: Path of the text file to analyse.
        model: Sequence-classification model; called as ``model(**inputs)``
            and expected to expose ``.logits``, ``.config`` and ``.device``.
        tokenizer: Tokenizer matching ``model``; called with
            ``return_tensors="pt"``.
        threshold: Minimum top logit for a sentence's prediction to be kept.
            Defaults to 6.15, the value previously hard-coded here.

    Returns:
        Whatever ``map_ttp_id`` returns for the kept class ids, i.e. a tuple
        ``(ttp_ids, ttp_names)``.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        text = f.read()

    # An empty report has nothing to clean; skipping the helper keeps this
    # function working even if the helper does not accept an empty string.
    if text:
        text = remove_consec_newline(text)
    text = text.replace('\t', ' ')
    # NOTE(review): "\'" is the same string as "'", so this replace changes
    # nothing as written. Possibly "\\'" was intended - confirm before editing.
    text = text.replace("\'", "'")

    # NLTK sentences may still span several lines; split those and drop empties.
    sentences = [
        line
        for sent in nltk.sent_tokenize(text)
        for line in sent.split('\n')
        if len(line) > 0
    ]

    # Longest sequence the model can embed; longer inputs must be truncated or
    # the forward pass fails on the position embeddings.
    max_len = getattr(model.config, 'max_position_embeddings', 512)

    predicted_tech_id = []
    for txt_string in sentences:
        inputs = tokenizer(
            txt_string,
            return_tensors="pt",
            truncation=True,
            max_length=max_len,
        )
        # Put the inputs on the device of the model that was passed in, rather
        # than relying on a notebook-level global.
        inputs = inputs.to(model.device)

        with torch.no_grad():
            logits = model(**inputs).logits

        # Keep the predicted class only when the model is confident enough.
        if logits.max().item() > threshold:
            predicted_tech_id.append(logits.argmax().item())

    return map_ttp_id(predicted_tech_id)

In [9]:
# Report to analyse, plus the pickled lookup tables that map_ttp_id and
# ttp_id_name read through these module-level names.
fname = "APT28_unit42_sofacy_uses_dealerschoice_target_european_government_agency.txt"
label2ttpid = 'label_dict.pkl'
ttpid2name = 'ttp_id_name.pkl'

ttp_ids, ttp_names = extract_ttps(fname, model, tokenizer)

# Print one "id  -  name" line per extracted TTP.
print("Extracted TTPs are:")
for idx, ttp_id in enumerate(ttp_ids):
  print(ttp_id, " - ", ttp_names[idx])

sofacy uses dealerschoice to target european government agency
robert falcone by robert falcone
march 15, 2018 at 1:00 pm
category: unit 42 tags: dealerschoice, european government agency, sofacy summary back in october 2016, unit 42 published an initial analysis on a flash exploitation framework used by the sofacy threat group called dealerschoice. the attack consisted of microsoft word delivery documents that contained adobe flash objects capable of loading additional malicious flash objects embedded in the file or directly provided by a command and control server. sofacy continued to use dealerschoice throughout the fall of 2016, which we also documented in our december 2016 publication discussing sofacy s larger campaign. on march 12 and march 14, we observed the sofacy group carrying out an attack on a european government agency involving an updated variant of dealerschoice. the updated dealerschoice documents used a similar process to obtain a malicious flash object from a c2 ser

In [10]:
# Number of distinct TTP IDs extracted from the report (map_ttp_id de-duplicates them).
len(ttp_ids)

11