In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading tra

In [6]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.2.1-py3-none-any.whl (102 kB)
Installing collected packages: click, nltk
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [nltk][32m1/2[0m [nltk]
[1A[2KSuccessfully installed click-8.2.1 nltk-3.9.1


In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pickle
import torch
import os

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import nltk
# Download the 'punkt' tokenizer data package. The call's return value (True in
# the recorded run) is the last expression, so it is echoed as the cell output.
# NOTE(review): presumably this is the data nltk.sent_tokenize relies on further
# down — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Download the 'punkt_tab' tokenizer data package as well.
# NOTE(review): presumably nltk.sent_tokenize in the installed NLTK 3.9.1 looks
# up 'punkt_tab' rather than 'punkt' — confirm whether both downloads are needed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Load TTPXHunter from Hugging Face

In [8]:
# Load the model and tokenizer from the Hugging Face Hub
# (both are fetched from the same "nanda-rani/TTPXHunter" repository).
model = RobertaForSequenceClassification.from_pretrained("nanda-rani/TTPXHunter")
tokenizer = RobertaTokenizer.from_pretrained("nanda-rani/TTPXHunter")

# Use a CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the selected device (GPU or CPU). As the last expression
# of the cell, the result of this call is displayed as the cell output.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [9]:
def extract_ttp_from_sentences(sentences, threshold, label_dict, ttpid2name):
    """
    Classify each sentence and collect the TTPs (Tactics, Techniques, and Procedures)
    whose top prediction exceeds a confidence threshold.

    Relies on the notebook-level globals ``tokenizer``, ``model`` and ``device``.

    Args:
    - sentences (list of str): Sentences to classify; each one is run through the model on its own.
    - threshold (float): A prediction is kept only when its softmax probability is
      strictly greater than this value.
    - label_dict (str): Path to a pickle file containing a dict whose values are the
      integer class labels and whose keys are the TTP IDs
      (presumably ATT&CK-style strings such as "T1195" — confirm against the pickle).
    - ttpid2name (str): Path to a pickle file mapping TTP IDs to names; forwarded to
      translate_ttp_ids_to_names.

    Returns:
    - unique_ttp_ids (list): De-duplicated TTP IDs, in order of first appearance in ``sentences``.
    - names_for_ttp_ids (list of str): Names returned by translate_ttp_ids_to_names for those IDs.

    Raises:
    - KeyError: If a predicted integer label has no entry in the label dictionary.
    """
    predicted_labels = []

    # Run inference one sentence at a time.
    for text in sentences:
        # Tokenize (truncating to 256 tokens) and move the tensors to the model's device.
        inputs = tokenizer(text, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # Turn logits into class probabilities and pick the most likely class per row.
        probabilities = torch.softmax(logits, dim=1)
        max_prob, predicted_class_indices = torch.max(probabilities, dim=1)

        # Keep only the predictions that clear the confidence threshold.
        predicted_labels.extend(
            model.config.id2label[class_idx.item()]
            for prob, class_idx in zip(max_prob, predicted_class_indices)
            if prob.item() > threshold
        )

    # Each model label is split on '_' and its second piece is parsed as the integer class label.
    mapped_labels = [int(label.split('_')[1]) for label in predicted_labels]

    # NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
    # The loaded mapping gets its own name so the ``label_dict`` path argument is not overwritten.
    with open(label_dict, 'rb') as file:
        ttp_to_label = pickle.load(file)

    # Invert the dictionary so integer labels map back to TTP IDs.
    label_to_ttp = {label: ttp_id for ttp_id, label in ttp_to_label.items()}
    ttp_list = [label_to_ttp[label] for label in mapped_labels]

    # De-duplicate while keeping first-seen order, so repeated runs give the same ordering
    # (a plain set() would not guarantee a stable order).
    unique_ttp_ids = list(dict.fromkeys(ttp_list))

    # Translate TTP IDs to their names.
    names_for_ttp_ids = translate_ttp_ids_to_names(unique_ttp_ids, ttpid2name)

    return unique_ttp_ids, names_for_ttp_ids

def remove_consecutive_newlines(text):
    """
    Collapse every run of consecutive newlines into a single newline.

    Args:
    - text (str): Input string with potential consecutive newlines. May be empty.

    Returns:
    - str: The input with each run of '\\n' characters reduced to one '\\n'.
      An empty input is returned unchanged.
    """
    kept_chars = []
    previous_char = None
    for char in text:
        # Skip a newline that immediately follows another newline.
        if char == '\n' and previous_char == '\n':
            continue
        kept_chars.append(char)
        previous_char = char
    # Join once at the end instead of growing a string character by character.
    return ''.join(kept_chars)

def process_text_file_for_attack_patterns(file_name, threshold, label_dict, ttpid2name):
    """
    Read a text report, split it into sentence-level lines, and extract TTPs from them.

    Args:
    - file_name (str): Path to the input text file (read as UTF-8).
    - threshold (float): Confidence threshold forwarded to extract_ttp_from_sentences.
    - label_dict (str): Path to the label-dictionary pickle, forwarded unchanged.
    - ttpid2name (str): Path to the TTP-ID-to-name pickle, forwarded unchanged.

    Returns:
    - tuple: (unique TTP IDs, names corresponding to TTP IDs), exactly as returned by
      extract_ttp_from_sentences.
    """
    # Read the whole report.
    with open(file_name, 'r', encoding='utf-8') as file:
        text = file.read()

    # Guard so an empty file never reaches the newline helper with nothing to index.
    if text:
        text = remove_consecutive_newlines(text)

    # Replace tabs with spaces and unescape backslash-apostrophe sequences.
    # "\\'" is a backslash followed by an apostrophe; the un-doubled form "\'" is just
    # an apostrophe, which would make the replacement a no-op.
    text = text.replace('\t', ' ').replace("\\'", "'")

    # Split into sentences, then break each sentence on the newlines that remain,
    # dropping empty and whitespace-only pieces so they are never sent to the classifier.
    sentences = []
    for sentence in nltk.sent_tokenize(text):
        sentences.extend(line for line in sentence.split('\n') if line.strip())

    # Extract TTPs from the processed sentences.
    return extract_ttp_from_sentences(sentences, threshold, label_dict, ttpid2name)

def translate_ttp_ids_to_names(ttp_ids, ttpid2name, unknown_name="Unknown"):
    """
    Translate TTP (Tactics, Techniques, and Procedures) IDs to human-readable names.

    The result always has one entry per input ID, in the same order, so callers can
    pair ``ttp_ids[i]`` with the i-th returned name.

    Args:
    - ttp_ids (list): TTP IDs to translate; each must be usable as a key of the pickled mapping.
    - ttpid2name (str): Path to a pickle file containing the ID-to-name mapping.
    - unknown_name (str): Placeholder used for IDs missing from the mapping.

    Returns:
    - list of str: One name per input ID; ``unknown_name`` where the ID is not in the mapping.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(ttpid2name, 'rb') as file:
        id_to_name_map = pickle.load(file)

    # Keep positions aligned with ttp_ids: a missing ID yields the placeholder
    # instead of being dropped from the result.
    ttp_names = [id_to_name_map.get(ttp_id, unknown_name) for ttp_id in ttp_ids]

    return ttp_names


In [14]:
# Inputs for this run: the two pickle lookup tables, the report to analyse,
# and the confidence cut-off applied to each sentence-level prediction.
label_dict = 'label_dict.pkl'
ttpid2name = 'ttp_id_name.pkl'
report = "SharpPanda_APT_Campaign_Expands_its_Arsenal_Targeting_G20_Nations.txt"
th = 0.644

ttps, ttp_names = process_text_file_for_attack_patterns(
    file_name=report,
    threshold=th,
    label_dict=label_dict,
    ttpid2name=ttpid2name,
)
print(len(ttps))

# Print every extracted TTP ID next to the name stored at the same position.
for position, ttp_id in enumerate(ttps):
    print(ttp_id, " - ", ttp_names[position])

2
T1195  -  Supply Chain Compromise
T1587  -  Develop Capabilities
