### Import libraries

In [None]:
import os
import time
import torch
import random
import zipfile
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


### Setup for GPU

In [None]:
# Sanity check: make sure TensorFlow can see a GPU before going any further

import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it detected
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU device name is treated as "no GPU"
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

In [None]:
# Pick the device PyTorch will run on: the GPU when CUDA is usable, else the CPU

if not torch.cuda.is_available():
    # No CUDA device: fall back to the CPU
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report what was found
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

### Installing the Hugging Face Library

In [None]:
!pip install transformers

### Setup parameters


In [None]:
# --- Reproducibility ---------------------------------------------------------
# Single seed shared by every random number generator used in this notebook
seed_val = 42

# --- Training schedule -------------------------------------------------------
# Number of passes over the training set (the BERT authors recommend 2 to 4)
epochs = 4

# --- Input shape -------------------------------------------------------------
# Maximum number of tokens per encoded sample; used as `max_length` when
# tokenizing
window_length = 512

# Number of samples per DataLoader batch (the BERT authors recommend 16 or 32
# for fine-tuning)
batch_size = 16

### Load the labeled dataset

In [None]:
# Read the labeled entries from the JSON file into a DataFrame
df = pd.read_json('../input/total-entries-labeled-processed/total_entries_labeled_processed.json')

# Show how many rows were loaded
print('Number of sentences: {:,}\n'.format(len(df)))

# Peek at five randomly chosen rows
df.sample(5)

### Split train - validation - test sets

In [None]:
# Two-stage stratified split of the dataset into train / validation / test.
#
# Stage 1: hold out 30% of the rows (validation + test together), keeping the
# class proportions of `CTIrelevant` in both parts
features_train, features_val_test, labels_train, labels_val_test = train_test_split(
    df['processedContent'],      # Features: the text of each entry
    df['CTIrelevant'],           # Labels: the class of each entry
    test_size=0.3,               # 30% of the rows go to validation + test
    stratify=df['CTIrelevant'],  # Keep the label proportions in both parts
    random_state=seed_val        # Fixed seed so the split is reproducible
)

# Stage 2: split the held-out 30% again, stratified on its own labels
features_val, features_test, labels_val, labels_test = train_test_split(
    features_val_test, labels_val_test,
    test_size=0.333,             # ~1/3 of the held-out part becomes the test set (~10% of all rows), the rest (~20%) validation
    stratify=labels_val_test,    # Keep the label proportions in both parts
    random_state=seed_val        # Fixed seed so the split is reproducible
)

# Full rows (all columns) of each split, selected through the index labels
# that the split Series still carry from `df`
df_train = df.loc[features_train.index]
df_val = df.loc[features_val.index]
df_test = df.loc[features_test.index]

In [None]:
# Give the three DataFrames a fresh positional index; the previous index is
# kept as an extra column because `drop` is not set
df_train, df_val, df_test = (
    frame.reset_index() for frame in (df_train, df_val, df_test)
)

# Same for the feature Series, this time discarding the old index
features_train, features_val, features_test = (
    series.reset_index(drop=True)
    for series in (features_train, features_val, features_test)
)

# ... and for the label Series
labels_train, labels_val, labels_test = (
    series.reset_index(drop=True)
    for series in (labels_train, labels_val, labels_test)
)

# Report the size of each split
print('Training samples:', len(features_train))
print('Validation samples:', len(features_val))
print('Test samples:', len(features_test))

### BERT Tokenizer

In [None]:
# Load the pretrained WordPiece tokenizer of the uncased BERT base model.
# `do_lower_case=True` makes the tokenizer lowercase its input, matching the
# uncased vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Inspect possible number of tokens

In [None]:
# Number of tokens the tokenizer produces for every entry in the dataset
seq_len = [
    len(tokens) for tokens in map(tokenizer.tokenize, df['processedContent'])
]

# Wide figure for the histogram
plt.rcParams["figure.figsize"] = (12, 6)

# Distribution of the token counts over 30 bins
token_counts = pd.Series(seq_len)
token_counts.hist(bins=30)

# Display the histogram
plt.show()

In [None]:
# Zoom in on the shorter entries: keep only token counts below 500
filtered_seq_len = list(filter(lambda n_tokens: n_tokens < 500, seq_len))

# Histogram of the filtered token counts over 30 bins
short_counts = pd.Series(filtered_seq_len)
short_counts.hist(bins=30)

# Display the histogram
plt.show()

In [None]:
# Walk through the tokenization of the first training sample
sentence = features_train[0]

# Tokenize once and reuse the tokens for both print-outs below
sentence_tokens = tokenizer.tokenize(sentence)

# The raw text
print(' Original: ', sentence)

# The text split into tokens
print('Tokenized: ', sentence_tokens)

# The tokens mapped to their vocabulary ids
print('Token IDs: ', tokenizer.convert_tokens_to_ids(sentence_tokens))

### Tokenize train set

In [None]:
# Tokenize all of the training sentences and map the tokens to their word IDs.
#
# A single batched tokenizer call replaces the per-sample `encode_plus` loop.
# For every sentence it will:
#   (1) Tokenize the sentence
#   (2) Prepend the `[CLS]` token to the start
#   (3) Append the `[SEP]` token to the end
#   (4) Map tokens to their IDs
#   (5) Truncate the sentence to `window_length` tokens and pad shorter ones
#       up to that length
#   (6) Create attention masks that separate real tokens from [PAD] tokens
#
# `padding='max_length'` together with `truncation=True` replaces the
# deprecated `pad_to_max_length=True`, which only truncated because the
# tokenizer switched truncation on implicitly (with a warning)
encoded_train = tokenizer(
    features_train.tolist(),      # Sentences to encode
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    max_length=window_length,     # Target length for padding and truncation
    padding='max_length',         # Pad every sentence up to `max_length`
    truncation=True,              # Cut sentences longer than `max_length`
    return_attention_mask=True,   # Construct attn. masks
    return_tensors='pt',          # Return pytorch tensors
)

# One row per sentence, `window_length` columns
input_ids = encoded_train['input_ids']
attention_masks = encoded_train['attention_mask']
labels = torch.tensor(labels_train)

# Print sentence 0, now as a list of IDs
print('Original: ', features_train[0])
print('Tokenized: ', tokenizer.tokenize(features_train[0]))
print('Token IDs:', input_ids[0])

In [None]:
# Bundle the training tensors into one dataset; indexing it yields an
# (input ids, attention mask, label) triple per sample
train_dataset = TensorDataset(input_ids, attention_masks, labels)

# Report how many samples the dataset holds
print('{:>5,} training samples'.format(len(train_dataset)))

In [None]:
# Batch iterator over the training set

# Draw the training samples in random order
train_sampler = RandomSampler(train_dataset)

# Serve the training samples in batches of `batch_size`
train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=batch_size,
)

### Tokenize validation set

In [None]:
# Tokenize all of the validation sentences and map the tokens to their word IDs.
#
# A single batched tokenizer call replaces the per-sample `encode_plus` loop.
# For every sentence it will:
#   (1) Tokenize the sentence
#   (2) Prepend the `[CLS]` token to the start
#   (3) Append the `[SEP]` token to the end
#   (4) Map tokens to their IDs
#   (5) Truncate the sentence to `window_length` tokens and pad shorter ones
#       up to that length
#   (6) Create attention masks that separate real tokens from [PAD] tokens
#
# `padding='max_length'` together with `truncation=True` replaces the
# deprecated `pad_to_max_length=True`, which only truncated because the
# tokenizer switched truncation on implicitly (with a warning)
encoded_val = tokenizer(
    features_val.tolist(),        # Sentences to encode
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    max_length=window_length,     # Target length for padding and truncation
    padding='max_length',         # Pad every sentence up to `max_length`
    truncation=True,              # Cut sentences longer than `max_length`
    return_attention_mask=True,   # Construct attn. masks
    return_tensors='pt',          # Return pytorch tensors
)

# One row per sentence, `window_length` columns
input_ids = encoded_val['input_ids']
attention_masks = encoded_val['attention_mask']
labels = torch.tensor(labels_val)

# Print sentence 0, now as a list of IDs
print('Original: ', features_val[0])
print('Tokenized: ', tokenizer.tokenize(features_val[0]))
print('Token IDs:', input_ids[0])

In [None]:
# Bundle the validation tensors into one dataset; indexing it yields an
# (input ids, attention mask, label) triple per sample
val_dataset = TensorDataset(input_ids, attention_masks, labels)

# Report how many samples the dataset holds
print('{:>5,} validation samples'.format(len(val_dataset)))

In [None]:
# Batch iterator over the validation set

# Order is irrelevant for evaluation, so simply read the samples in sequence
val_sampler = SequentialSampler(val_dataset)

# Serve the validation samples in batches of `batch_size`
validation_dataloader = DataLoader(
    val_dataset,
    sampler=val_sampler,
    batch_size=batch_size,
)

### Train Classification Model

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # Use the 12-layer BERT model, with an uncased vocab
    num_labels = 2,               # The number of output labels, 2 for binary classification
    output_attentions = False,    # Whether the model returns attentions weights
    output_hidden_states = False, # Whether the model returns all hidden-states
)

# Move the model to the device selected earlier in the notebook. Using
# `to(device)` instead of `cuda()` keeps the CPU fallback working: the batches
# are moved with `.to(device)` as well, so model and data always live on the
# same device
model.to(device)

In [None]:
# Optimizer used for fine-tuning: AdamW over all model parameters.
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# NOTE(review): later transformers releases deprecate this class in favour of
# `torch.optim.AdamW` — confirm against the installed version
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

In [None]:
# Total number of training steps is [number of batches] x [number of epochs]
# (This is not the same as the number of training samples)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: a linear schedule with no warmup steps,
# spanning `total_steps` optimizer steps. It is advanced once per batch in
# the training loop
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
# Accuracy helper used during validation

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    `preds` is an array of per-class scores with one row per sample; the
    predicted class of a row is the index of its largest score (argmax over
    axis 1). `labels` is an array of the expected class indices; it is
    flattened before the comparison, so any shape with one entry per row of
    `preds` works.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    expected_classes = labels.ravel()
    matches = predicted_classes == expected_classes
    return matches.sum() / expected_classes.size

In [None]:
# Helper for printing elapsed times

def format_time(elapsed):
    """Format a duration given in seconds as an `h:mm:ss` string.

    The duration is rounded to the nearest whole second and rendered through
    `datetime.timedelta`, so hours are not zero-padded (e.g. `0:01:00`) and
    durations of a day or more get a `N day(s), ` prefix.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Fine-tune the model for `epochs` epochs, validating after every epoch and
# keeping on disk the checkpoint with the lowest validation loss

# Seed every random number generator in play so the run is reproducible
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings
training_stats = []

# Lowest average validation loss seen so far. It has to live OUTSIDE the epoch
# loop: resetting it every epoch would overwrite the checkpoint with any epoch
# whose loss is below the reset value instead of with the best epoch. Starting
# at +inf guarantees that the first epoch always writes a checkpoint
best_eval_loss = float('inf')

# Measure the total training time for the whole run
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes
    t0 = time.time()

    # Reset the total loss for this epoch
    total_train_loss = 0

    # Put the model into training mode. The call to `train` only changes the
    # *mode*, it does not *perform* the training: `dropout` and `batchnorm`
    # layers behave differently during training vs. test
    # (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of the epoch, formatted as h:mm:ss
            elapsed = format_time(time.time() - t0)

            # Report progress
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from the dataloader and copy each tensor
        # to the selected device using the `to` method
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because
        # accumulating the gradients is "convenient while training RNNs".
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here:
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # Because labels are provided, the result carries the loss next to the
        # "logits" (the model outputs prior to activation); only the loss is
        # needed for training
        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss

        # Accumulate the training loss over all of the batches so that it's
        # possible to calculate the average loss at the end. `loss` is a Tensor
        # containing a single value; `.item()` returns it as a Python number
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc
        optimizer.step()

        # Update the learning rate
        scheduler.step()

    # Calculate the average loss over all of the batches
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure the performance on
    # the validation set

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation
    model.eval()

    # Running totals for this validation pass
    total_eval_accuracy = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        # Unpack this validation batch from the dataloader and copy each
        # tensor to the selected device using the `to` method
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training)
        with torch.no_grad():

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # The "logits" are the output values prior to applying an
            # activation function like the softmax
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        # Accumulate the validation loss
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of validation sentences, and
        # accumulate it over all batches
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run (mean of the per-batch
    # accuracies)
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Checkpoint the model whenever this epoch beats the best validation loss
    # of all previous epochs
    if avg_val_loss < best_eval_loss:
        torch.save(model, 'bert_model')
        best_eval_loss = avg_val_loss

    # Record all statistics from this epoch
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

### Training evaluation

In [None]:
# Per-epoch summary table of the training run

# Restore the default display options, then show floats with two decimals
pd.reset_option('^display.')
pd.set_option('display.precision', 2)

# One row per epoch, indexed by the epoch number
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table
df_stats

In [None]:
# Learning curve: training vs. validation loss per epoch

# Use plot styling from seaborn
sns.set(style='darkgrid')

# Increase the plot size and font size
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve; the x values are the epoch numbers in the index
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# One tick per recorded epoch. Taking the ticks from the table index keeps the
# axis correct for any number of epochs instead of a fixed 1..4
plt.xticks(list(df_stats.index))

plt.show()

### Performance On Test Set

In [None]:
# Load best model: restore the checkpoint that the training loop wrote with
# `torch.save(model, 'bert_model')`. The file holds the whole pickled model
# object, not just its weights
# NOTE(review): recent PyTorch releases default `torch.load` to
# `weights_only=True`, which would reject a fully pickled model — confirm
# against the installed version
model = torch.load('bert_model')

In [None]:
# Tokenize all of the test sentences and map the tokens to their word IDs.
#
# A single batched tokenizer call replaces the per-sample `encode_plus` loop.
# For every sentence it will:
#   (1) Tokenize the sentence
#   (2) Prepend the `[CLS]` token to the start
#   (3) Append the `[SEP]` token to the end
#   (4) Map tokens to their IDs
#   (5) Truncate the sentence to `window_length` tokens and pad shorter ones
#       up to that length
#   (6) Create attention masks that separate real tokens from [PAD] tokens
#
# `padding='max_length'` together with `truncation=True` replaces the
# deprecated `pad_to_max_length=True`, which only truncated because the
# tokenizer switched truncation on implicitly (with a warning)
encoded_test = tokenizer(
    features_test.tolist(),       # Sentences to encode
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    max_length=window_length,     # Target length for padding and truncation
    padding='max_length',         # Pad every sentence up to `max_length`
    truncation=True,              # Cut sentences longer than `max_length`
    return_attention_mask=True,   # Construct attn. masks
    return_tensors='pt',          # Return pytorch tensors
)

# One row per sentence, `window_length` columns
input_ids = encoded_test['input_ids']
attention_masks = encoded_test['attention_mask']
labels = torch.tensor(labels_test)

# Create the DataLoader: test samples are read sequentially, in batches of
# `batch_size`
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Run the model over the test set and collect logits and true labels per batch

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Evaluation mode: layers such as dropout behave differently than in training
model.eval()

# One entry per batch: the logits array and the matching label array
predictions, true_labels = [], []

for batch in prediction_dataloader:
    # Move the three tensors of the batch to the selected device
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory and
    # time
    with torch.no_grad():
        # Forward pass without labels: only the logits are of interest
        result = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            return_dict=True,
        )

    # Bring logits and labels back to the CPU as numpy arrays and store them
    predictions.append(result.logits.detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

print('    DONE.')

# Share of positive samples in the test labels
n_positive = labels.sum().item()
print('\nPositive samples: %d of %d (%.2f%%)' % (n_positive, len(labels), (n_positive / len(labels) * 100.0)))

### Test evaluation

In [None]:
# Merge the per-batch arrays into one array of labels and one of logits
flat_true_labels = np.concatenate(true_labels, axis=0)
flat_predictions = np.concatenate(predictions, axis=0)

# Each batch of predictions is a 2-column ndarray (one column for "0" and one
# for "1"). Take the column with the highest value per row to get a 0/1 label,
# then join the batches into a single array of predicted labels
flat_predictions_avg = np.concatenate(
    [np.argmax(batch_logits, axis=1).flatten() for batch_logits in predictions],
    axis=0,
)

# Classification metrics of the predicted labels against the true labels
accuracy = accuracy_score(flat_true_labels, flat_predictions_avg)
precision = precision_score(flat_true_labels, flat_predictions_avg)
recall = recall_score(flat_true_labels, flat_predictions_avg)
f1 = f1_score(flat_true_labels, flat_predictions_avg)

print('Accuracy: %.3f' % accuracy)
print('Precision: %.3f' % precision)
print('Recall: %.3f' % recall)
print('F1 Score: %.3f' % f1)

In [None]:
# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(flat_true_labels, flat_predictions_avg)

# Draw it as an annotated heatmap on a small figure and label it through the
# returned axes
plt.figure(figsize=(3, 3))
heatmap_ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False)
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted Labels")
heatmap_ax.set_ylabel("True Labels")
plt.show()

In [None]:
# Matthews correlation coefficient (MCC) per test batch; MCC is used here
# because the classes are imbalanced
print('Calculating Matthews Corr. Coef. for each batch...')

# For every batch: turn the 2-column logits into 0/1 labels (index of the
# larger column per row) and score them against the true labels of the batch
matthews_set = [
    matthews_corrcoef(batch_true, np.argmax(batch_logits, axis=1).flatten())
    for batch_true, batch_logits in zip(true_labels, predictions)
]

# Bar plot with one bar per batch
batch_numbers = list(range(len(matthews_set)))
ax = sns.barplot(x=batch_numbers, y=matthews_set, errorbar=None)

plt.title('MCC Score per Batch')
plt.ylabel('MCC Score (-1 to +1)')
plt.xlabel('Batch #')

plt.show()

In [None]:
# MCC over the whole test set

# Join the logits of all batches and pick, per sample, the label (0 or 1) with
# the higher score
flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

# Join the true labels of all batches into a single array
flat_true_labels = np.concatenate(true_labels, axis=0)

# Score the predicted labels against the true labels
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('Total MCC: %.3f' % mcc)

In [None]:
# ROC curve and AUC of the positive class on the test set

# Combine the raw model outputs (logits) across all batches: one row per
# sample, one column per class
flat_predictions = np.concatenate(predictions, axis=0)

# Convert the logits to probabilities with a softmax over the two classes and
# keep the positive-class column. The raw class-1 logit must not be used as
# the score: it ignores the class-0 logit and therefore ranks samples
# differently from the actual class-1 probability. Subtracting the row maximum
# first keeps the exponentials from overflowing and does not change the result
shifted_logits = flat_predictions - flat_predictions.max(axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probs = (exp_logits / exp_logits.sum(axis=1, keepdims=True))[:, 1]

# Calculate the false positive rate (fpr), true positive rate (tpr), and threshold values using the roc_curve function
fpr, tpr, thresholds = roc_curve(flat_true_labels, probs)

# Calculate the Area Under the Curve (AUC) using the auc function
roc_auc = auc(fpr, tpr)

In [None]:
# Draw the ROC curve together with the diagonal of a random classifier
plt.rcParams["figure.figsize"] = (8,5)
roc_fig, roc_ax = plt.subplots()

# The curve itself, with the AUC in the legend label
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)

# Reference diagonal
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Axis ranges, labels, title and legend
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.legend(loc="lower right")
plt.show()

### Inspect prediction

In [None]:
# One record per test sample: identifying columns from `df_test`, the text,
# and the true and predicted labels
predtiction_sample = []

for index, (true_label, predicted_label) in enumerate(zip(flat_true_labels, flat_predictions_avg)):
    # Look the row up once and read all identifying columns from it
    test_row = df_test.loc[index]

    predtiction_sample.append(
        {
            'postID': test_row.at["postID"],
            'threadID': test_row.at["threadID"],
            'subforumID': test_row.at["subforumID"],
            'origin': test_row.at["origin"],
            'processedContent': features_test[index],
            'True label': true_label,
            'Predicted label': predicted_label
        }
    )

In [None]:
# Table of all prediction records
df_stats = pd.DataFrame(data=predtiction_sample)

# Left-align both the header cells ('th') and the data cells ('td')
left_aligned = [
    {'selector': selector, 'props': [('text-align', 'left')]}
    for selector in ('th', 'td')
]
styler = df_stats.style.set_table_styles(left_aligned)

# Display the styled table
styler

In [None]:
# Inspect one test sample in full
index = 0

# Do not shorten long cell contents when displaying the table
pd.set_option('display.max_colwidth', None)

# Identifying columns of the chosen row, its text and both labels
chosen_row = df_test.loc[index]
predtiction_sample = [
    {
        'postID': chosen_row.at["postID"],
        'threadID': chosen_row.at["threadID"],
        'subforumID': chosen_row.at["subforumID"],
        'origin': chosen_row.at["origin"],
        'Sample': features_test[index],
        'True label': flat_true_labels[index],
        'Predicted label': flat_predictions_avg[index]
    }
]

# One-row table for the chosen sample
df_stats = pd.DataFrame(data=predtiction_sample)

# Left-align the header and data cells
header_style = {'selector': 'th', 'props': [('text-align', 'left')]}
cell_style = {'selector': 'td', 'props': [('text-align', 'left')]}
styler = df_stats.style.set_table_styles([header_style, cell_style])

# Display the styled table
styler

### Saving & Loading Fine-Tuned Model

In [None]:
# Write the fine-tuned model and the tokenizer to disk
output_dir = "/home/anon/Classifier_BERT"
os.makedirs(output_dir, exist_ok=True)
print("Saving model to %s" % output_dir)

# A model wrapped for distributed/parallel training exposes the real model as
# `.module`; unwrap it when that attribute exists, otherwise use the model
# itself
model_to_save = getattr(model, 'module', model)

# `save_pretrained()` writes files that `from_pretrained()` can load again
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Pack the saved model directory into a compressed zip archive
output_dir = '/home/anon/Classifier_BERT'
zip_path = '/home/anon/Classifier_BERT.zip'

with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, filenames in os.walk(output_dir):
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            # Store every file under its path relative to the model directory
            archive.write(file_path, arcname=os.path.relpath(file_path, output_dir))

In [None]:
# Restore the fine-tuned model and its vocabulary from the zip archive
zip_path = '/home/anon/Classifier_BERT.zip'
output_dir = '/home/anon/Classifier_BERT'

# Unpack the archive into the model directory
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(path=output_dir)

# Tokenizer saved alongside the model
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Fine-tuned classification model
model = BertForSequenceClassification.from_pretrained(output_dir)

# Move the model to the selected device
model.to(device)

### Classification of unseen sample (an example)

In [None]:
# Classify a single, previously unseen text with the fine-tuned model
txt = "random phrase threat kind sql injection other random word confuse classificator wonder result prediction"

# Apply the same tokenization as for the training data.
# `padding='max_length'` together with `truncation=True` replaces the
# deprecated `pad_to_max_length=True`, which only truncated because the
# tokenizer switched truncation on implicitly (with a warning)
inputs = tokenizer(
    txt,
    add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
    max_length=window_length,     # Target length for padding and truncation
    padding='max_length',         # Pad the sentence up to `max_length`
    truncation=True,              # Cut the sentence if longer than `max_length`
    return_attention_mask=True,   # Construct attn. masks
    return_tensors='pt',          # Return pytorch tensors
)

# Move every tensor produced by the tokenizer to the selected device
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make the prediction; no gradients are needed for inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Softmax turns the logits into class probabilities; the predicted class is
# the one with the highest probability
probabilities = torch.softmax(logits, dim=1)
predicted_class = torch.argmax(probabilities, dim=1).item()

# Print the result
pd.set_option('display.max_colwidth', None)  # Display full content in each cell

predtiction_sample = [
    {
        'Sample': txt,
        'Predicted label': predicted_class
    }
]

# Create a DataFrame from the prediction
df_stats = pd.DataFrame(data=predtiction_sample)

# Create a styler object that left-aligns header and data cells
styler = df_stats.style.set_table_styles([
    {'selector': 'th', 'props': [('text-align', 'left')]},
    {'selector': 'td', 'props': [('text-align', 'left')]}
])

# Display the table with aligned content
styler