## Installing and importing libraries

In [None]:
!pip install -q transformers seqeval[gpu]

In [None]:
import os
import time
import json
import torch
import random
import zipfile
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch import cuda
from tabulate import tabulate
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from seqeval.metrics import classification_report
from transformers import RobertaTokenizerFast, RobertaForTokenClassification

In [None]:
# Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

## Setup parameters and functions

In [None]:
# Entity categories used by the annotation
category_tags = [
    "APT", "SECTEAM", "IDTY", "OS", "EMAIL", "LOC", "TIME", "IP", "DOM", "URL", "PROT",
    "FILE", "TOOL", "MD5", "SHA1", "SHA2", "MAL", "ENCR", "VULNAME", "VULID", "ACT",
]
# Prefixes of the annotation scheme
limits = ["B", "I"]
# "O" first, then the B-/I- variant of every category, in category order
entity_tags = ["O"] + [f"{limit}-{tag}" for tag in category_tags for limit in limits]

In [None]:
# Lookup tables between tags and their position in entity_tags.
# The indices are stored as strings, not ints.
labels_to_ids = dict(zip(entity_tags, map(str, range(len(entity_tags)))))  # tag -> index string
ids_to_labels = dict(zip(map(str, range(len(entity_tags))), entity_tags))  # index string -> tag

In [None]:
SEED_VAL = 42  # Seed given to the Python, NumPy and PyTorch RNGs in start_training
MAX_LEN = 128  # Length every tokenized sentence is padded/truncated to
BATCH_SIZE = 16  # Batch size used by the train, validation and test dataloaders
EPOCHS = 15  # Number of passes over the training set
LEARNING_RATE = 2e-05  # Learning rate of the Adam optimizer
MAX_GRAD_NORM = 10  # max_norm passed to clip_grad_norm_
model_type = "s2w-ai/DarkBERT"  # Checkpoint identifier handed to from_pretrained
# Passed as `token=` to from_pretrained.
# NOTE(review): presumably a Hugging Face access token has to be filled in here — confirm
access_token = ""
uncased = False  # When True, sentences are lower-cased while the dataset is loaded

# The dataset class feeds pre-split words (is_split_into_words=True) to this tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(model_type, add_prefix_space=True, token=access_token)

#### Dataset loading

In [None]:
# Define the function to create df
def create_dataframe_from_file(filename, uncased=False):
    """Load a JSON annotation file into a DataFrame.

    The file must hold a JSON list of objects, each with a 'sentence' and a
    'tags' entry. Entries whose sentence is the empty string are skipped.

    Args:
        filename: Path of the UTF-8 encoded JSON file.
        uncased: When True, every sentence is lower-cased; the tags are kept
            unchanged.

    Returns:
        A DataFrame with the columns 'sentence' and 'word_labels' (the latter
        holds the 'tags' value). The columns are present even when no entry
        survives the filtering.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    entries = [
        {
            'sentence': entry['sentence'].lower() if uncased else entry['sentence'],
            'word_labels': entry['tags'],
        }
        for entry in data
        if entry['sentence'] != ""
    ]

    # Naming the columns explicitly keeps the schema stable for empty input
    return pd.DataFrame(entries, columns=['sentence', 'word_labels'])

In [None]:
def check_test_tags(data):
    """Print a table with the number of tagged words per entity category.

    Every non-'O' tag is counted, whatever its prefix, so the numbers are
    tag occurrences rather than entity spans.

    Args:
        data: DataFrame whose 'word_labels' column holds comma-separated tags.
    """
    # One counter per known category, in category_tags order
    type_counts = dict.fromkeys(category_tags, 0)

    for _, row in data.iterrows():
        for tag in row["word_labels"].split(','):
            if tag == 'O':
                continue
            # Tags have the form "<limit>-<entity>"; only the entity is counted
            _, entity = tag.split('-')
            type_counts[entity] += 1

    # Two rows (names, counts) so the table is laid out horizontally
    header_row = ["Entity", *type_counts.keys()]
    count_row = ["Count", *type_counts.values()]
    print(tabulate([header_row, count_row], tablefmt="plain"))

In [None]:
# Word-level tags are expanded to wordpiece level: every wordpiece of a word
# receives that word's tag, all remaining positions are set to -100
class dataset(Dataset):  # Transforms examples of a dataframe to PyTorch tensors
    """Dataset turning (sentence, word tags) rows into padded model inputs.

    Args:
        dataframe: DataFrame with a 'sentence' column (whitespace-separated
            words) and a 'word_labels' column (comma-separated tags).
        tokenizer: Tokenizer used for both encoding and per-word tokenization.
            It must return an "offset_mapping" when called and offer
            ``tokenize``.
        max_len: Length the encoded sentences are padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict of tensors.

        The dict contains every entry of the tokenizer encoding plus 'labels',
        which holds one label id per token position.
        """
        # step 1: get the sentence and word labels
        words = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: encode the pre-split sentence (padding/truncation to max length)
        encoding = self.tokenizer(words,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: create the token-level labels
        # labels_to_ids stores the indices as strings, so convert them explicitly
        labels = [int(labels_to_ids[label]) for label in word_labels]
        # start from -100 everywhere; -100 is the default ignore_index of
        # PyTorch's CrossEntropyLoss
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # Position of every wordpiece inside its word (1 = first piece).
        # Use the tokenizer this dataset was built with, not a module global.
        piece_positions = [
            position
            for word in words
            for position in range(1, len(self.tokenizer.tokenize(word)) + 1)
        ]
        n_pieces = len(piece_positions)

        word_idx = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            # positions with a (0, 0) offset keep the -100 label
            if mapping[0] == 0 and mapping[1] == 0:
                continue
            encoded_labels[idx] = labels[word_idx]
            # piece_positions[idx] describes the NEXT token (assumes exactly one
            # special token precedes the sentence — TODO confirm for other
            # tokenizers); move on once that token opens a new word
            if idx >= n_pieces or piece_positions[idx] == 1:
                word_idx += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Return the number of rows of the wrapped dataframe."""
        return self.len

In [None]:
def load_dataset_from(directory, uncased=False):
    """Build the train, validation and test dataloaders for one dataset folder.

    Reads 'Train.json', 'Val.json' and 'Test.json' from ``directory``, prints
    the shape of each split and the tag counts of the test split, and wraps
    every split in a DataLoader with batch size BATCH_SIZE.

    Args:
        directory: Folder containing the three JSON files.
        uncased: Forwarded to create_dataframe_from_file.

    Returns:
        The tuple (training_loader, val_loader, testing_loader). Only the
        training loader shuffles its data.
    """
    df_train, df_val, df_test = (
        create_dataframe_from_file(os.path.join(directory, split_file), uncased=uncased)
        for split_file in ('Train.json', 'Val.json', 'Test.json')
    )

    print("TRAIN Dataset: {}".format(df_train.shape))
    print("VAL Dataset: {}".format(df_val.shape))
    print("TEST Dataset: {}".format(df_test.shape))
    check_test_tags(df_test)

    # Tensor datasets first, dataloaders afterwards
    splits = [dataset(frame, tokenizer, MAX_LEN) for frame in (df_train, df_val, df_test)]
    shuffle_flags = (True, False, False)

    training_loader, val_loader, testing_loader = (
        DataLoader(split, batch_size=BATCH_SIZE, shuffle=flag, num_workers=0)
        for split, flag in zip(splits, shuffle_flags)
    )

    return training_loader, val_loader, testing_loader

#### Training

In [None]:
# Helping function for time
def format_time(elapsed):
    '''
    Convert a duration given in seconds to its h:mm:ss string form.

    The duration is rounded to a whole number of seconds before formatting.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Use plot styling from seaborn
def plot_stats(df_stats):
    """Plot the training and validation loss curves.

    Args:
        df_stats: DataFrame with the columns 'Training Loss' and
            'Valid. Loss'; its index is used for the x axis.
    """
    # One call is enough: a second sns.set() would reset the style to its
    # default anyway
    sns.set(style='darkgrid', font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12, 6)

    # Plot the learning curve
    plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
    plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

    # Label the plot
    plt.title("Training & Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    # One tick per plotted row instead of a fixed [1, 2, 3, 4]
    plt.xticks(list(df_stats.index))

    plt.show()

In [None]:
# Summary of the training process
def display_df_stats(training_stats):
    """Show the per-epoch statistics as a table followed by the loss plot.

    Args:
        training_stats: Records (one per epoch) that include an 'epoch' key;
            that key becomes the row index of the table.
    """
    # Reset the pandas display options, then show two decimals
    pd.reset_option('^display.')
    pd.set_option('display.precision', 2)

    stats_table = pd.DataFrame(data=training_stats).set_index('epoch')

    display(stats_table)
    plot_stats(stats_table)

In [None]:
# Define training function 
def start_training(model_name, training_loader, val_loader):
    """Fine-tune the pretrained checkpoint and save the best model.

    A ``RobertaForTokenClassification`` is loaded from ``model_type`` and
    trained with Adam for ``EPOCHS`` epochs. After every epoch the model is
    evaluated on ``val_loader``; whenever the average validation loss is lower
    than the best value seen so far in this run, the whole model object is
    written to ``model_name`` with ``torch.save``. Progress is printed and the
    collected statistics are finally passed to ``display_df_stats``.

    Args:
        model_name: Path the best model is saved to.
        training_loader: DataLoader yielding batches with 'input_ids',
            'attention_mask' and 'labels'.
        val_loader: DataLoader with the same batch layout, used for validation.

    Returns:
        None.
    """
    # Seed every RNG before the model is created so that the randomly
    # initialised classification head is reproducible too
    random.seed(SEED_VAL)
    np.random.seed(SEED_VAL)
    torch.manual_seed(SEED_VAL)
    torch.cuda.manual_seed_all(SEED_VAL)

    model = RobertaForTokenClassification.from_pretrained(model_type, num_labels=len(labels_to_ids), token=access_token)
    model.to(device)

    # Define the optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    # Per-epoch losses, validation accuracy and timings
    training_stats = []

    # Lowest validation loss seen so far; must live outside the epoch loop,
    # otherwise every epoch would be compared against the initial value
    best_eval_loss = float('inf')

    # Measure the total training time for the whole run
    total_t0 = time.time()

    for epoch_i in range(EPOCHS):
        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
        print('Training...')

        # Measure how long the training epoch takes
        t0 = time.time()

        # Reset for this epoch
        tr_loss, tr_accuracy = 0.0, 0.0
        nb_tr_steps = 0

        # Put the model into training mode
        model.train()

        for idx, batch in enumerate(training_loader):

            # Progress update every 40 batches
            if idx % 40 == 0 and not idx == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(idx, len(training_loader), elapsed))

            # Unpack this training batch from the dataloader
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Clear the gradients of the previous step
            optimizer.zero_grad()

            # Forward pass
            result = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = result.loss
            # .item() stores a plain float; adding the tensor itself would keep
            # the autograd graph of every batch alive
            tr_loss += loss.item()
            nb_tr_steps += 1

            # Training accuracy, computed only where the label is not -100
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = result.logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)
            active_positions = flattened_targets != -100  # shape (batch_size * seq_len,)

            targets = torch.masked_select(flattened_targets, active_positions)
            predictions = torch.masked_select(flattened_predictions, active_positions)
            tr_accuracy += accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())

            # Backward pass; the gradients only exist after backward(), so they
            # are clipped between backward() and the optimizer step
            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(), max_norm=MAX_GRAD_NORM
            )
            optimizer.step()

        # Averages over all of the batches
        avg_train_loss = tr_loss / len(training_loader)
        tr_accuracy = tr_accuracy / nb_tr_steps

        # Measure how long this epoch took
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Average training accuracy: {0:.2f}".format(tr_accuracy))

        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Evaluation mode: dropout layers behave differently than in training
        model.eval()

        eval_loss, eval_accuracy = 0.0, 0.0

        for batch in tqdm(val_loader, desc="Validation"):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # No compute graph is needed for evaluation
            with torch.no_grad():
                result = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=True)

            # Accumulate the validation loss
            eval_loss += result.loss.item()

            # Validation accuracy, computed only where the label is not -100
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = result.logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)
            active_positions = flattened_targets != -100  # shape (batch_size * seq_len,)

            targets = torch.masked_select(flattened_targets, active_positions)
            predictions = torch.masked_select(flattened_predictions, active_positions)
            eval_accuracy += accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())

        # Report the final accuracy for this validation run
        avg_val_accuracy = eval_accuracy / len(val_loader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches
        avg_val_loss = eval_loss / len(val_loader)

        # Measure how long the validation run took
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Save the model only when it beats the best validation loss so far
        if avg_val_loss < best_eval_loss:
            torch.save(model, model_name)
            best_eval_loss = avg_val_loss

        # Record all statistics from this epoch
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )
    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    display_df_stats(training_stats)

#### Testing

In [None]:
# Define testing function
def start_testing(model_name, testing_loader):
    """Evaluate the saved model on the test set and print the results.

    Prints the average loss, the average per-batch token accuracy (computed
    where the label is not -100) and the seqeval classification report.

    Args:
        model_name: Path of the model object written by ``torch.save``.
        testing_loader: DataLoader yielding batches with 'input_ids',
            'attention_mask' and 'labels'.

    Returns:
        None.
    """
    # Load the saved model onto the active device so that a checkpoint written
    # on a GPU can also be evaluated on a CPU-only machine.
    # NOTE(review): torch.load unpickles the file, so only load trusted
    # checkpoints; recent PyTorch releases may additionally require
    # weights_only=False for whole-model pickles — confirm for the installed version
    model = torch.load(model_name, map_location=device)
    model.to(device)

    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0.0, 0.0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for batch in tqdm(testing_loader, desc="Testing"):

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            result = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=True)
            # Accumulate a plain float so the printed loss is a number,
            # not a tensor repr
            eval_loss += result.loss.item()
            nb_eval_steps += 1

            # Keep only the positions whose label is not -100
            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = result.logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)
            active_positions = flattened_targets != -100  # shape (batch_size * seq_len,)

            targets = torch.masked_select(flattened_targets, active_positions)
            predictions = torch.masked_select(flattened_predictions, active_positions)

            # One tag sequence per batch for the seqeval report
            eval_labels.append([ids_to_labels[str(label_id)] for label_id in targets.tolist()])
            eval_preds.append([ids_to_labels[str(label_id)] for label_id in predictions.tolist()])

            eval_accuracy += accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Testing Loss: {eval_loss}")
    print(f"Testing Accuracy: {eval_accuracy}")

    print(classification_report(eval_labels, eval_preds))

## Execution

In [None]:
# main_dir and folder select the dataset sub-directory that is joined into the
# input path in the next cell
main_dir = 'REP'
folder = 'NO3_REP'
# File name the model is saved to during training and reloaded from afterwards
model_name = 'REP_NER_model'

In [None]:
print(f'RUNNING {folder}')
# Folder holding the Train/Val/Test JSON files of the selected dataset
directory = os.path.join('/home/anon/input/final-ner-datasets/Final_NER_datasets', main_dir, folder)
train_l, val_l, test_l = load_dataset_from(directory, uncased=uncased)  
# NOTE(review): repeats the assignment of the previous cell with the same value
model_name = 'REP_NER_model'
# Fine-tune (the model is written to model_name), then evaluate the saved
# model on the test split
start_training(model_name, train_l, val_l)
start_testing(model_name, test_l)

## Saving & Loading Fine-Tuned Model

In [None]:
# Reload the whole model object that start_training wrote with torch.save
model = torch.load(model_name)

In [None]:
output_dir = "/home/anon/working/REP_NER_model_folder"

# exist_ok=True creates the folder when needed and is a no-op otherwise, which
# avoids the separate (race-prone) existence check
os.makedirs(output_dir, exist_ok=True)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# map_location lets a checkpoint written on a GPU be exported on a CPU-only machine
model = torch.load(model_name, map_location=device)
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Create a zip file
output_dir = '/home/anon/working/REP_NER_model_folder'
zip_path = '/home/anon/working/REP_NER_model.zip'

# Add every file found under output_dir to the archive; entries are stored
# under their path relative to output_dir, so the absolute prefix is left out
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(output_dir):
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), output_dir))

In [None]:
"""# Load trained model and vocabulary 
zip_path = '/home/anon/working/REP_NER_model.zip'
output_dir = '/home/anon/working/REP_NER_model'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_dir)

# Load the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(output_dir)

# Load the fine-tuned model
model = RobertaForTokenClassification.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)"""