# Binary Classification of Moralizing Sequences using BERT

## Hardware
Prepare the GPU.

In [None]:
import tensorflow as tf

# Ask TensorFlow for the name of the GPU device it can see (if any).
device_name = tf.test.gpu_device_name()

# Anything other than the canonical name of the first GPU counts as "no GPU"
# and aborts the notebook right here.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print(f'Found GPU at: {device_name}')

In [None]:
import torch

# Pick the compute device once; later cells send the batches to `device`.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if gpu_available:
    # Report how many GPUs are visible and which one sits at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

## Preparation
Load the data into the desired format and set up the BERT classification model.

In [None]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import DataLoader, TensorDataset, random_split
import json
import random

# Single seed reused throughout the notebook: for Python's `random` here, as the
# `random_state` of the train/validation split, and for torch before training.
seed_val = 1998
random.seed(seed_val)

In [None]:
# Tokenizer and classifier are loaded from the same pre-trained checkpoint.
checkpoint = 'bert-base-german-cased'

# Two output labels (binary task); attentions and hidden states are switched off.
model_options = dict(
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, **model_options)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
with open("/content/full_dataset.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Entries are read positionally: [0] genre, [1] text, [2] label.
texts, genres, labels = [], [], []
for entry in data:
    genres.append(entry[0])
    texts.append(entry[1])
    labels.append(entry[2])

Deal with data imbalance.

In [None]:
# Check data imbalance: fraction of entries labelled 1 in the full dataset.
print(labels.count(1)/len(labels))

In [None]:
# Separate the raw entries by label (third field of each entry).
label_0_data = [entry for entry in data if entry[2] == 0]
label_1_data = [entry for entry in data if entry[2] == 1]

# Both classes are reduced to the size of the smaller one. Sampling more items
# than a class contains would make `random.sample` raise, so the target size is
# the minimum of the two class sizes rather than always `len(label_1_data)`.
n_per_class = min(len(label_0_data), len(label_1_data))
if n_per_class == 0:
    raise ValueError("Cannot balance the dataset: one of the two classes has no examples.")

undersampled_label_0_data = random.sample(label_0_data, n_per_class)

# Label 1 is only subsampled when it is the larger class; otherwise it is kept
# untouched, which reproduces the previous behaviour exactly.
if len(label_1_data) > n_per_class:
    undersampled_label_1_data = random.sample(label_1_data, n_per_class)
else:
    undersampled_label_1_data = label_1_data

# Combine both classes and shuffle so the labels are not grouped together.
balanced_data = undersampled_label_0_data + undersampled_label_1_data
random.shuffle(balanced_data)

# Separate into texts, genres, and labels again
texts = [entry[1] for entry in balanced_data]
genres = [entry[0] for entry in balanced_data]
labels = [entry[2] for entry in balanced_data]

print("Balance achieved: ", labels.count(1)/len(labels))

OPTIONAL: We might want to use text-based genre labels instead of numeric ones.

In [None]:
def get_genre_string(genre_nbr):
    """Translate a numeric genre id (1-7) into its German text label.

    Raises:
        KeyError: if ``genre_nbr`` is not one of the known genre numbers.
    """
    # Position in this tuple (counted from 1) is the genre number.
    genre_names = (
        "Nachrichten über Gerichtsurteile",
        "Interviews in Zeitungen",
        "Kolumnen und Kommentare in Zeitungen",
        "Leserbriefe in Zeitungen",
        "Plenarprotokolle",
        "Wikipedia-Forum online",
        "Sachbücher",
    )
    lookup = dict(enumerate(genre_names, start=1))
    return lookup[genre_nbr]

# Build the text labels from `balanced_data` instead of rewriting `genres` in
# place: the cell can then be re-run without feeding already-translated strings
# back into `get_genre_string` (which would raise a KeyError).
genres = [get_genre_string(entry[0]) for entry in balanced_data]

Tokenization. This is also where the genre labels can be prepended to each
sequence (separated by `[SEP]`). The line that does this is commented out in the
cell below; uncomment it to include the genre labels, or leave it as is to train
without them.

In [None]:
# Token-id vectors, one per text, each padded/truncated to 256 ids.
tokenized_texts = []

for index, sents in enumerate(texts):
    # Each text is a sequence of strings that is joined into one span.
    span = " ".join(sents)

    # The following line dictates whether genre labels are added to the data
    #span = str(genres[index]) + " [SEP] " + span

    tokenized_sents = tokenizer.encode(span, max_length=256, truncation=True, padding='max_length')
    tokenized_texts.append(tokenized_sents)

# The mask is 1 for real tokens and 0 for padding. The padding id is read from
# the tokenizer instead of assuming it is 0, so the masks stay correct if the
# checkpoint is swapped for one with a different padding id.
pad_token_id = tokenizer.pad_token_id
attention_masks = [
    [int(token != pad_token_id) for token in vector]
    for vector in tokenized_texts
]

Split dataset and transform into tensors.

In [None]:
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
# Token ids, attention masks and labels are split in ONE call so that all three
# are guaranteed to be partitioned with the same indices, instead of relying on
# two separate calls happening to shuffle identically.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    tokenized_texts, attention_masks, labels,
    random_state=seed_val, test_size=0.1)

In [None]:
# The model consumes torch tensors, so every split (token ids, attention masks
# and labels) is converted from Python lists here.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by both loaders. For fine-tuning BERT on a specific
# task, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Bundle ids, masks and labels per split; every batch yields them in that order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training examples are drawn in random order, validation examples in sequence.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

Choose model parameters.

In [None]:
# Set Up Optimizer
# Step size handed to the optimizer.
learning_rate = 3e-5
# Number of full passes over the training data.
epochs = 4
# Number of warm-up steps handed to the learning-rate scheduler.
warmup_steps = 0

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import Adam

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps in total.
total_steps = epochs * len(train_dataloader)

optimizer = Adam(model.parameters(), lr=learning_rate)

# Learning-rate schedule with the configured number of warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# Move the model to the device selected in the Hardware section (GPU when one is
# available, CPU otherwise) so it lives where the batches are sent. A plain
# `model.cuda()` would fail on a machine without a GPU.
model.to(device)

## Training Loop

The following function allows us to track how long the model has been training.

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a clock-style string.

    The value is rounded to the nearest whole second and formatted the way
    ``str(datetime.timedelta)`` does, e.g. ``3661.4`` -> ``'1:01:01'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

Now comes the actual training.

In [None]:
from sklearn.metrics import accuracy_score
# This training code is based on this notebook here:
# https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/BERT_Fine_Tuning_Sentence_Classification_v2.ipynb
# Which in turn is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Re-seed Python's and torch's RNGs right before training so that repeated runs
# of this cell start from the same random state.
random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

for epoch_i in range(0, epochs):

    #=================Training=======================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically.
        model.zero_grad()

        # Forward pass. Because `labels` are provided, the returned output
        # object carries the training loss in addition to the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `.item()` extracts the Python
        # number from the single-value tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule.
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    #=================Validation=======================

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Predictions and gold labels are collected over the WHOLE validation set
    # and scored once. Averaging per-batch accuracies instead would give a
    # smaller final batch the same weight as a full one and skew the result.
    epoch_preds = []
    epoch_labels = []

    for batch in validation_dataloader:

        # Add batch to the compute device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # No gradients are needed for validation; this saves memory and time.
        with torch.no_grad():
            # No labels are passed here, so only the logits are of interest.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # The predicted class is the index of the largest logit.
        batch_preds = torch.argmax(outputs.logits, dim=1)

        epoch_preds.extend(batch_preds.cpu().numpy())
        epoch_labels.extend(b_labels.cpu().numpy())

    eval_accuracy = accuracy_score(epoch_labels, epoch_preds)

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

## Saving the Model


In [None]:
from google.colab import files

# Directory name for the exported model; the zip and copy commands in the
# following cells refer to the same name.
savefiles = "bert_nogenre_moredata"

# Write the fine-tuned model into that directory.
model.save_pretrained(savefiles)

In [None]:
!zip -r bert_nogenre_moredata.zip bert_nogenre_moredata/.

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the next cell copies the archive there.
drive.mount('/content/gdrive',force_remount=True)

In [None]:
!cp bert_nogenre_moredata.zip '/content/gdrive/My Drive/'
!ls -lt '/content/gdrive/My Drive/'

## Evaluation

Plot loss...

In [None]:
import matplotlib.pyplot as plt

import seaborn as sns

# Seaborn's dark-grid theme with enlarged fonts, on a wide figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12, 6)

# Epochs are numbered from 1 on the x-axis.
epoch_numbers = range(1, len(loss_values) + 1)

fig, ax = plt.subplots()
ax.plot(epoch_numbers, loss_values, 'b-o')

# Label the plot.
ax.set_title("Training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")

# One tick per epoch.
ax.set_xticks(epoch_numbers)

plt.show()

Accuracy, precision, recall, F1

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Evaluate the model on the validation set
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in validation_dataloader:
        # Move the batch to the compute device and unpack it. Batch-local names
        # are used so that the notebook-level `labels` list is not overwritten.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Only the logits are needed here, so no labels are passed to the model.
        outputs = model(b_input_ids, attention_mask=b_input_mask)

        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        all_preds.extend(predictions)
        all_labels.extend(b_labels.cpu().numpy())

# Calculate metrics including precision, recall, and F1-score
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


## Custom Tests
The following code allows you to see how example sentences will be labeled.

In [None]:
def run_model(model, tokenizer, sentence, probability):
    """Classify a single sentence with the given model.

    Args:
        model: Classification model. It is called as
            ``model(input_ids, attention_mask=...)`` and must return an object
            with a ``logits`` attribute.
        tokenizer: Tokenizer offering ``encode(...)`` and ``pad_token_id``.
        sentence: The text to classify.
        probability: If truthy, return the class probabilities; otherwise
            return the predicted class index.

    Returns:
        The softmax probabilities as a tensor with a leading batch dimension of
        1 when ``probability`` is truthy, else the index of the most probable
        class as a Python int.
    """
    model.eval()

    token_ids = tokenizer.encode(sentence, max_length=256, truncation=True, padding='max_length')

    # 1 for real tokens, 0 for padding. The padding id comes from the tokenizer
    # rather than being assumed to be 0.
    pad_token_id = tokenizer.pad_token_id
    mask = [int(token != pad_token_id) for token in token_ids]

    # Run on whatever device the model's weights live on, so the function does
    # not depend on a notebook-level `device` variable being in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Add the batch dimension and move both tensors to the model's device.
    tokens = torch.tensor(token_ids).unsqueeze(0).to(target_device)
    attention_mask = torch.tensor(mask).unsqueeze(0).to(target_device)

    # Make the prediction
    with torch.no_grad():
        logits = model(tokens, attention_mask=attention_mask).logits

    # Turn the logits into class probabilities.
    probabilities = torch.nn.functional.softmax(logits, dim=1)

    if probability:
        return probabilities
    return torch.argmax(probabilities, dim=1).item()


def label_sentence(model, tokenizer, sentence,
                   genre=None, probability=False):
    """Label a sentence with a genre prefix prepended.

    Args:
        model: Model handed on to ``run_model``.
        tokenizer: Tokenizer handed on to ``run_model``.
        sentence: The text to classify.
        genre: Genre to prepend as ``"<genre> [SEP] <sentence>"``. When
            ``None``, the sentence is scored once for each genre number 1-7.
        probability: Handed on to ``run_model``; selects probabilities
            instead of the predicted class.

    Returns:
        A single ``run_model`` result when ``genre`` is given, otherwise a list
        with one result per genre number 1-7.
    """
    if genre is not None:
        return run_model(model, tokenizer,
                         str(genre) + " [SEP] " + sentence,
                         probability)

    genre_ratings = []
    for n in range(1, 8):
        try:
            prefix = get_genre_string(n)
        except (NameError, KeyError):
            # NameError: the optional cell defining `get_genre_string` was not
            # run; KeyError: the number has no text label. Either way the plain
            # number is used. Other exceptions are no longer swallowed.
            prefix = n
        genre_ratings.append(run_model(model, tokenizer,
                                       str(prefix) + " [SEP] " + sentence,
                                       probability))

    return genre_ratings


In [None]:
# Example sentence to classify; `result` is the predicted class index.
sequence = 'Wir müssen mal.'
result = run_model(model, tokenizer, sequence, False)

# A truthy (non-zero) class index is reported as moralizing.
print("Moralisierung" if result else "Keine Moralisierung")

You can also retrieve all errors on the validation set...

In [None]:
model.eval()

# One (decoded text, gold label, predicted label) tuple per validation example.
all_results_val = []

with torch.no_grad():
    for batch in validation_dataloader:
        # Move the batch to the compute device and unpack it. Batch-local names
        # are used so that the notebook-level `labels` list is not overwritten.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        outputs = model(b_input_ids, attention_mask=b_input_mask)

        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        for index, input_id in enumerate(b_input_ids):
            all_results_val.append((tokenizer.decode(input_id), b_labels[index].item(), predictions[index]))

# Print only the misclassified examples: gold label, prediction, text.
for result in all_results_val:
    if result[1] != result[2]:
        print(result[1], result[2], result[0])

or everything the model classified correctly...

In [None]:
model.eval()

# One (decoded text, gold label, predicted label) tuple per validation example.
all_results_val = []

with torch.no_grad():
    for batch in validation_dataloader:
        # Move the batch to the compute device and unpack it. Batch-local names
        # are used so that the notebook-level `labels` list is not overwritten.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        outputs = model(b_input_ids, attention_mask=b_input_mask)

        # The predicted class is the index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        for index, input_id in enumerate(b_input_ids):
            all_results_val.append((tokenizer.decode(input_id), b_labels[index].item(), predictions[index]))

# Print only the correctly classified examples: gold label, prediction, text.
for result in all_results_val:
    if result[1] == result[2]:
        print(result[1], result[2], result[0])