### Import libraries

In [None]:
import os
import math
import time
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

import warnings
warnings.filterwarnings("ignore")

### Setup for GPU

In [None]:
# Select the device on which the model will run

# Prefer CUDA whenever PyTorch can see a GPU, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

### Set up the paths

In [None]:
# Directory with the fine-tuned BERT model
model_input_dir = '../input/classifier-bert'

# Directory with the posts to label
df_input_dir = '/home/anon/input/chunks-other-entries-not-labeled-processed'

# Directory with the final predicted posts
json_output_dir = '/home/anon/working/outputs'

# Directory with intermediate predictions
intermediate_predictions_dir = '/home/anon/working/intermediate_predictions'

# exist_ok=True creates the directories only when they are missing, without the
# race of a separate "does it exist?" check followed by the creation
os.makedirs(json_output_dir, exist_ok=True)
os.makedirs(intermediate_predictions_dir, exist_ok=True)

### Load the model

In [None]:
# Restore the tokenizer and the fine-tuned classifier; both are read from
# the same directory (model_input_dir)
tokenizer = BertTokenizer.from_pretrained(model_input_dir)
model = BertForSequenceClassification.from_pretrained(model_input_dir)

# Move the model onto the selected device (the GPU when one is available)
model.to(device)

In [None]:
# Number of posts fed to the model at once (the DataLoader needs this value)
batch_size = 64

# Maximum sequence length of each window - same value as in the training phase
window_length = 512

### Function definitions

In [None]:
"""
Load the dataset and setup values to start predictions
    Parameters: name of the file to retrieve
    Returns: the created dataframe, boundaries to start fetching for predictions
"""
def load_dataset(file_name):
    """Load one chunk of posts and work out where predictions should resume.

    Parameters:
        file_name: name of the JSON file to read from df_input_dir.

    Returns:
        A tuple (df, max_retrieved_data, min_retrieved_data): the loaded
        DataFrame, its number of rows, and the number of lines already present
        in the intermediate predictions file (0 when that file does not exist).
    """
    # The part of the name before the first '.' identifies the predictions file
    stem = file_name.split('.')[0]
    source_path = os.path.join(df_input_dir, file_name)
    checkpoint_path = os.path.join(intermediate_predictions_dir, stem) + '.txt'

    # Read the JSON file into a pandas DataFrame and report its size
    df = pd.read_json(source_path)
    total_rows = df.shape[0]
    print('Number of sentences: {:,}\n'.format(total_rows))

    # Each line of the intermediate file is one stored prediction, so the
    # line count is the index of the first row that still has to be predicted
    already_predicted = 0
    if os.path.isfile(checkpoint_path):
        with open(checkpoint_path, 'r') as checkpoint:
            for already_predicted, _ in enumerate(checkpoint, start=1):
                pass
    print("Resume prediction from row #", already_predicted)

    return df, total_rows, already_predicted

In [None]:
"""
Define the steps to tokenize the input and perform classification
    Parameters: name of the file to retrieve, dataframe on which operating, boundaries to start fetching for predictions
    Returns: Nothing -> Predictions are stored in the file intermediate_predictions_path
"""
def make_classification(file_name, df, max_retrieved_data, min_retrieved_data):
    """Tokenize the rows still to be predicted and append their labels to disk.

    Parameters:
        file_name: name of the chunk file; the part before its first '.' names
            the intermediate predictions file.
        df: DataFrame whose 'processedContent' column holds the texts.
        max_retrieved_data: exclusive upper bound of the rows to classify.
        min_retrieved_data: position of the first row to classify.

    Returns:
        None. After every batch, one label per line is appended to the
        intermediate predictions file, so an interrupted run can be resumed.
    """
    if min_retrieved_data >= max_retrieved_data:
        print("Classification already completed")
        return

    # Retrieve path
    intermediate_predictions_path = os.path.join(intermediate_predictions_dir, file_name.split('.')[0]) + '.txt'

    # Select the pending rows by position, so the selection does not depend
    # on the labels of the DataFrame index
    texts = df['processedContent'].iloc[min_retrieved_data:max_retrieved_data].tolist()

    input_ids = []
    attention_masks = []

    # Evaluate time elapsed
    start = time.time()
    print("Start tokenization")

    # Tokenize the texts one batch at a time
    for batch_start in tqdm(range(0, len(texts), batch_size), desc="Tokenization"):
        batch_inputs = tokenizer.batch_encode_plus(
            texts[batch_start:batch_start + batch_size],
            add_special_tokens=True,
            max_length=window_length,
            # Explicit padding/truncation replace the deprecated
            # pad_to_max_length flag: every sequence comes out exactly
            # window_length tokens long, so the batches can be concatenated
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Collect the encoded input ids and attention masks
        input_ids.append(batch_inputs['input_ids'])
        attention_masks.append(batch_inputs['attention_mask'])

    # Convert the lists into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Create the DataLoader; the sequential sampler keeps the row order, which
    # is required because predictions are matched to rows by line number
    prediction_data = TensorDataset(input_ids, attention_masks)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

    print("Elapsed time for tokenization ", time.time() - start)
    print("Start classification")

    # Restart the timer so the figure printed at the end covers the
    # classification phase only, not tokenization as well
    start = time.time()

    # Make sure the model is in evaluation mode (dropout disabled)
    model.eval()

    for batch in tqdm(prediction_dataloader, desc="Processing"):
        # Move the batch to the device and unpack it
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # No gradients are needed for prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           return_dict=True)

        # Move logits to CPU
        logits = result.logits.detach().cpu().numpy()

        # Each row of logits holds one score per class; the predicted label
        # is the index of the highest score
        pred_labels = np.argmax(logits, axis=1)

        # Append this batch's labels right away: reopening the file per batch
        # means completed batches are on disk if the run is interrupted
        with open(intermediate_predictions_path, 'a') as file:
            np.savetxt(file, pred_labels, fmt='%d')

    end = time.time()
    print("Elapsed time for classification: [s]", round(end - start, 2))

In [None]:
"""
Show the statistics of the classified posts and save them in a json file then release the dataframe
    Parameters: name of the file to retrieve, dataframe on which operating, upper boundary
    Returns: Nothing -> The labeled posts are saved as a JSON file in json_output_dir
"""
def save_predictions(file_name, df, max_retrieved_data):
    """Attach the stored predictions to the posts and write them to a JSON file.

    Parameters:
        file_name: name of the chunk file; also used as the output file name
            inside json_output_dir.
        df: DataFrame with the posts; its 'CTIrelevant' column is updated
            in place.
        max_retrieved_data: number of rows (from the top of df) that must
            have a prediction.

    Returns:
        None. The labeled DataFrame is saved to json_output_dir.

    Raises:
        ValueError: if the number of stored predictions differs from
            max_retrieved_data.
    """
    # Retrieve paths
    intermediate_predictions_path = os.path.join(intermediate_predictions_dir, file_name.split('.')[0]) + '.txt'
    json_output_path = os.path.join(json_output_dir, file_name)

    # After all posts are predicted, retrieve all their labels
    if max_retrieved_data == 0 and not os.path.isfile(intermediate_predictions_path):
        # An empty chunk never gets a predictions file; nothing to load
        res = np.empty(0, dtype=int)
    else:
        # ndmin=1 keeps a one-line file from collapsing into a 0-d array
        res = np.loadtxt(intermediate_predictions_path, dtype=int, ndmin=1)

    # Fail with a clear message instead of an obscure assignment error
    if res.shape[0] != max_retrieved_data:
        raise ValueError(
            "Expected %d predictions in %s but found %d"
            % (max_retrieved_data, intermediate_predictions_path, res.shape[0])
        )

    # Analyze labels
    count_ones = np.count_nonzero(res == 1)
    count_zeros = np.count_nonzero(res == 0)

    print("Number of not CTI-relevant posts: ", count_zeros)
    print("Number of CTI-relevant posts: ", count_ones)

    # Update CTIrelevant field; rows are picked by position so the assignment
    # does not depend on the labels of the DataFrame index
    if max_retrieved_data > 0:
        df.loc[df.index[:max_retrieved_data], 'CTIrelevant'] = res

    # Save files
    print("Saving json to %s\n" % json_output_path)

    # Save the DataFrame to a JSON file in the specified directory
    df.to_json(json_output_path, orient='records', indent=4)

### Execution

In [None]:
def execute_code(filename):
    """Run the whole pipeline (load, classify, save) for a single chunk file.

    Parameters:
        filename: name of the chunk file inside df_input_dir.

    Returns:
        None. Nothing is done when the output JSON for this file already exists.
    """
    print("Retrieve", filename)

    # An existing output file means this chunk was already handled
    if os.path.isfile(os.path.join(json_output_dir, filename)):
        print("File already processed, skipped")
        return

    # Load the posts together with the bounds of the rows still to predict
    posts, upper_bound, lower_bound = load_dataset(filename)

    # Predict the missing labels, then attach them to the posts and save
    make_classification(filename, posts, upper_bound, lower_bound)
    save_predictions(filename, posts, upper_bound)

In [None]:
# True: process only a specified interval of chunks
# False: process every file found in df_input_dir
choose_interval = True

In [None]:
if choose_interval:
    # Process chunks 16 through 22 (the upper bound of range is exclusive)
    for i in range(16, 23):
        filename = 'chunk{}_other_entries_not_labeled_processed.json'.format(i)
        execute_code(filename)

In [None]:
if not choose_interval:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order the same on every run
    for filename in sorted(os.listdir(df_input_dir)):
        # Skip sub-directories (e.g. notebook checkpoint folders), which
        # cannot be loaded as a chunk file
        if not os.path.isfile(os.path.join(df_input_dir, filename)):
            continue
        execute_code(filename)