### Import libraries

In [None]:
import os
import math
import time
import json
import torch
import zipfile
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizerFast, RobertaForTokenClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

### Set up the paths

In [None]:
# Directory with the fine-tuned RoBERTa model
model_input_dir = '/home/anon/input/ner-models/HF_NER_model'

# Directory with the posts to label
df_input_dir = '/home/anon/input/chunks-nlp/HF_chunks_NLP'

# Directory with the final predicted posts.
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
json_output_dir = '/home/anon/working/outputs'
os.makedirs(json_output_dir, exist_ok=True)

# Directory with intermediate predictions (one '.txt' file per processed chunk)
intermediate_predictions_dir = '/home/anon/working/intermediate_predictions'
os.makedirs(intermediate_predictions_dir, exist_ok=True)

### Load the model

In [None]:
# Restore the tokenizer and the fine-tuned token-classification model
# from the same checkpoint directory
tokenizer = RobertaTokenizerFast.from_pretrained(model_input_dir)
model = RobertaForTokenClassification.from_pretrained(model_input_dir)

# Place the model weights on the selected device (GPU when available)
model = model.to(device)

In [None]:
# Maximum sequence length (in tokens) for each window - as in the training phase.
# It is passed to the tokenizer in make_extraction, where sentences are padded
# and truncated to this length.
max_length = 128

In [None]:
# Entity categories recognised by the model
tags = ["APT", "SECTEAM", "IDTY", "OS", "EMAIL", "LOC", "TIME", "IP", "DOM", "URL", "PROT",
        "FILE", "TOOL", "MD5", "SHA1", "SHA2", "MAL", "ENCR", "VULNAME", "VULID", "ACT"]

# Annotation scheme prefixes (the B-/I- markers placed before each category)
limits = ["B", "I"]

# Full label set: "O" first, then the B-/I- pair of every category in order
entity_tags = ["O"]
entity_tags.extend("{}-{}".format(limit, tag) for tag in tags for limit in limits)

In [None]:
# Lookup tables between tags and their (string) indices in entity_tags
ids_to_labels = dict(zip(map(str, range(len(entity_tags))), entity_tags))  # Maps indices to individual tags
labels_to_ids = {label: index for index, label in ids_to_labels.items()}  # Maps individual tags to indices

### Function definitions

In [None]:
def load_dataset(file_name):
    """
    Load the dataset and setup values to start predictions

    The input JSON holds one entry per post; the returned dataframe holds one
    row per sentence, with the post-level fields repeated on every row.

        Parameters: name of the file to retrieve (without the pre-processing
                    suffix and without the '.json' extension)
        Returns: the created dataframe, the number of sentences (upper boundary)
                 and the number of lines already present in the intermediate
                 predictions file (lower boundary, 0 when the file is missing)
    """
    # Retrieve paths
    input_filename = file_name + '_unrelatedContentRemoval_iocDefanging_passiveActiveConversion_pronounsSubjectEllipsisResolution_stopwordsRemoval_internetSlangRemoval_aliasesHandling.json'
    file_path = os.path.join(df_input_dir, input_filename)
    intermediate_predictions_path = os.path.join(intermediate_predictions_dir, file_name) + '.txt'

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Drop empty sentences. 'content' and 'aliases_list' are consumed in
    # lockstep below (zip), so when they are parallel lists the matching alias
    # entry has to be dropped too; otherwise every sentence after an empty one
    # would be paired with the aliases of its predecessor.
    for entry in data:
        sentences = entry['content']
        aliases = entry.get('aliases_list')
        if isinstance(aliases, list) and len(aliases) == len(sentences):
            kept = [(sentence, alias) for sentence, alias in zip(sentences, aliases) if sentence != ""]
            entry['content'] = [sentence for sentence, _ in kept]
            entry['aliases_list'] = [alias for _, alias in kept]
        else:
            # Lists are not parallel: keep the previous behaviour and only
            # filter the sentences.
            entry['content'] = [sentence for sentence in sentences if sentence != ""]

    df = pd.DataFrame(data)
    # Convert the 'date' column to datetime format
    df['date'] = pd.to_datetime(df['date'])
    # Format the 'date' column as MM-DD-YYYY
    df['date'] = df['date'].dt.strftime('%m-%d-%Y')

    # Report the number of articles
    print('Number of posts: {:,}'.format(df.shape[0]))

    # Post-level fields copied unchanged onto every sentence row
    post_columns = ['ID', 'postID', 'threadID', 'threadTitle', 'subforumID', 'subforumTitle',
                    'authorName', 'date', 'flatContent', 'origin', 'CTIrelevant']

    new_data = []
    for _, row in df.iterrows():
        shared = {column: row[column] for column in post_columns}
        for sentence, aliases in zip(row['content'], row['aliases_list']):
            sentence_row = dict(shared)
            sentence_row['content'] = sentence
            sentence_row['aliases_list'] = aliases
            new_data.append(sentence_row)

    # Passing the columns explicitly keeps them present even when no sentence survived
    df = pd.DataFrame(new_data, columns=post_columns + ['content', 'aliases_list'])

    # Report the number of sentences
    print('Number of sentences: {:,}'.format(df.shape[0]))

    # Read file with intermediate predictions, count the number of rows and set min_retrieved_data
    max_retrieved_data = df.shape[0]
    min_retrieved_data = 0

    if os.path.isfile(intermediate_predictions_path):
        with open(intermediate_predictions_path, 'r') as file:
            min_retrieved_data = sum(1 for _ in file)
    print("Resume prediction from row #", min_retrieved_data)

    return df, max_retrieved_data, min_retrieved_data

In [None]:
def make_extraction(file_name, df, max_retrieved_data, min_retrieved_data):
    """
    Define the steps to tokenize the input and perform classification

    Every sentence in rows [min_retrieved_data, max_retrieved_data) is
    lower-cased, split on whitespace, tokenized and classified; one label per
    word is appended, comma-separated, as one line of the intermediate file.

        Parameters: name of the file to retrieve, dataframe on which operating, boundaries to start fetching for predictions
        Returns: Nothing -> Predictions are stored in the file intermediate_predictions_path
    """
    # Retrieve path
    intermediate_predictions_path = os.path.join(intermediate_predictions_dir, file_name) + '.txt'

    if min_retrieved_data >= max_retrieved_data:
        print("Entity extraction already completed")
        return

    # Positional, end-exclusive slice: exactly the rows that are still pending
    df_slice = df.iloc[min_retrieved_data:max_retrieved_data]

    # Every prediction is written as a newline-terminated line. A file left by
    # an earlier run may end without a newline; terminate that last line first
    # so the next prediction is not glued onto it.
    needs_separator = False
    if os.path.isfile(intermediate_predictions_path) and os.path.getsize(intermediate_predictions_path) > 0:
        with open(intermediate_predictions_path, 'rb') as existing:
            existing.seek(-1, os.SEEK_END)
            needs_separator = existing.read(1) != b'\n'

    # no_grad: inference only, so no autograd graph needs to be kept
    with open(intermediate_predictions_path, 'a') as file, torch.no_grad():
        if needs_separator:
            file.write('\n')

        for _, row in tqdm(df_slice.iterrows(), total=len(df_slice), desc='Inference'):
            sentence = row['content'].lower()
            words = sentence.split()

            if not words:
                # Nothing to classify: keep the row aligned with an empty line
                file.write('\n')
                file.flush()
                continue

            inputs = tokenizer(words,
                               is_split_into_words=True,
                               return_offsets_mapping=True,
                               padding='max_length',
                               truncation=True,
                               max_length=max_length,
                               return_tensors="pt")

            # move to gpu
            ids = inputs["input_ids"].to(device)
            mask = inputs["attention_mask"].to(device)

            # forward pass
            outputs = model(ids, attention_mask=mask)
            logits = outputs[0]

            active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1).tolist()  # predictions at the token level

            tokens = tokenizer.convert_ids_to_tokens(ids[0].tolist())

            # Keep one prediction per word: the one of its first wordpiece,
            # which is the token starting with 'Ġ'
            prediction = [ids_to_labels[str(label_id)]
                          for token, label_id in zip(tokens, flattened_predictions)
                          if token.startswith('Ġ')]
            # Words cut off by truncation get the 'O' label
            padded_prediction = prediction + ['O'] * (len(words) - len(prediction))

            # Save intermediate predictions; flushing after each row keeps the
            # file usable for resuming if the run is interrupted
            file.write(','.join(padded_prediction) + '\n')
            file.flush()

In [None]:
def save_predictions(file_name, df, max_retrieved_data):
    """
    Attach the stored predictions to the sentences, regroup them by post and save them in a json file

        Parameters: name of the file to retrieve, dataframe on which operating
                    (one row per sentence; a 'tags' column is added to it),
                    upper boundary (not used here, kept for existing callers)
        Returns: Nothing -> Entities are stored in the file json_output_path
        Raises: ValueError when the number of stored predictions differs from
                the number of sentences
    """
    # Retrieve paths
    intermediate_predictions_path = os.path.join(intermediate_predictions_dir, file_name) + '.txt'
    json_output_path = os.path.join(json_output_dir, file_name) + '.json'

    # After all entities are predicted, retrieve all their tokens.
    # Splitting on '\n' (instead of readlines) keeps a final empty prediction
    # that is not followed by a newline.
    with open(intermediate_predictions_path, 'r') as file:
        res = [row.strip() for row in file.read().split('\n')]

    # A newline-terminated file yields one spurious empty entry at the end
    if len(res) == len(df) + 1 and res[-1] == '':
        res.pop()

    # A mismatch would silently shift or blank the tags of every later sentence
    if len(res) != len(df):
        raise ValueError(
            "Found %d prediction lines in %s but the dataframe has %d sentences"
            % (len(res), intermediate_predictions_path, len(df))
        )

    # Create field for tags (one comma-separated string per sentence)
    df['tags'] = res

    # Group the sentences back into posts; the post-level fields are the keys
    # and the per-sentence fields are collected into lists.
    # dropna=False keeps posts whose key fields contain a missing value, which
    # groupby would otherwise drop without notice.
    post_columns = ['ID', 'postID', 'threadID', 'threadTitle', 'subforumID', 'subforumTitle',
                    'authorName', 'date', 'flatContent', 'origin', 'CTIrelevant']
    grouped = (df.groupby(post_columns, dropna=False)
                 .agg({'content': list, 'tags': list, 'aliases_list': list})
                 .reset_index())

    # Save files
    print("Saving json to %s\n" % json_output_path)

    # Save the DataFrame to a JSON file in the specified directory
    grouped.to_json(json_output_path, orient='records', indent=4)

### Execution

In [None]:
def execute_code(filename):
    """
    Run the whole pipeline (load, extract, save) for one chunk
        Parameters: name of the chunk to process
        Returns: Nothing -> the chunk is skipped when its output json already exists
    """
    print("Retrieve", filename)

    # An existing output json means the chunk was fully processed in an earlier run
    output_file = os.path.join(json_output_dir, filename) + '.json'
    already_processed = os.path.isfile(output_file)

    if not already_processed:
        # Load the sentences together with the resume boundaries
        frame, upper_bound, lower_bound = load_dataset(filename)

        # Predict the entities of the sentences that are still pending
        make_extraction(filename, frame, upper_bound, lower_bound)

        # Merge the predictions into the posts and write the final json
        save_predictions(filename, frame, upper_bound)
    else:
        print("File already processed, skipped\n")

In [None]:
# Selects which of the two driver cells below does the work:
# True  -> process the fixed interval of 'HF_<n>' chunks
# False -> process every file listed in df_input_dir
choose_interval = True

In [None]:
if choose_interval:
    # Chunks are named HF_1 ... HF_167; both ends of the interval are processed
    first_chunk, last_chunk = 1, 167
    for chunk_number in range(first_chunk, last_chunk + 1):
        execute_code(f'HF_{chunk_number}')

In [None]:
if not choose_interval:
    # os.listdir returns complete file names, but load_dataset expects only the
    # chunk name and appends this suffix itself. Strip it here, otherwise the
    # suffix would be added twice and the file would never be found.
    input_suffix = '_unrelatedContentRemoval_iocDefanging_passiveActiveConversion_pronounsSubjectEllipsisResolution_stopwordsRemoval_internetSlangRemoval_aliasesHandling.json'

    # sorted() makes the processing order reproducible (listdir order is arbitrary)
    for entry in sorted(os.listdir(df_input_dir)):
        if entry.endswith(input_suffix):
            execute_code(entry[:-len(input_suffix)])

In [None]:
# Bundle every intermediate prediction file into a single ZIP archive
source_directory = intermediate_predictions_dir
zip_file_path = '/home/anon/working/intermediate_predictions.zip'

# Collect all files below the source directory (sub-directories included)
files_to_archive = [
    os.path.join(root, name)
    for root, _, names in os.walk(source_directory)
    for name in names
]

# Store each file under its path relative to the source directory
with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for file_path in files_to_archive:
        archive.write(file_path, arcname=os.path.relpath(file_path, source_directory))

In [None]:
# Bundle every final JSON output file into a single ZIP archive
source_directory = json_output_dir
zip_file_path = '/home/anon/working/json_output.zip'

# Collect all files below the source directory (sub-directories included)
files_to_archive = [
    os.path.join(root, name)
    for root, _, names in os.walk(source_directory)
    for name in names
]

# Store each file under its path relative to the source directory
with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for file_path in files_to_archive:
        archive.write(file_path, arcname=os.path.relpath(file_path, source_directory))