## Preparation of the Environment

### Google Colab

In [1]:
#  Installation of the following additional packages
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.11.0-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 56.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

### Local Installation
On a local computer a virtual environment with all needed packages has to be setup. Follow the instructions given on Higgingface [here](https://huggingface.co/course/chapter0?fw=pt).

## Data Import

### Google Colab

In [2]:
# Import from GoogleDrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import os
os.chdir("//content/gdrive/MyDrive/NLP-Paper/data")

import numpy
import pandas as pd
data = pd.read_csv("data.csv", encoding="UTF-8")

### Local Installation

In [None]:
import os
os.chdir("<Insert the path to your local folder including the data here.>")

import numpy
import pandas as pd
data = pd.read_csv("data.csv", encoding="UTF-8")

## Data Preparation

In [4]:
# Split the data into two pieces, one for training and one for testing
from sklearn.model_selection import train_test_split
train_text_series , test_text_series, train_label_series, test_label_series = train_test_split(data["text"], data["label"], test_size = 0.30, random_state = 42)

# Split the training data set again to additionally get a validation data set for tuning the hyper parameters of the model
#train_text_series, valid_text_series, train_label_series, valid_label_series = train_test_split(train_text_series, train_label_series, test_size = 0.177, random_state = 42)

# Casting the data from series objects into lists (as expected from the tokenizer function below)
train_text = train_text_series.to_list()
#valid_text = valid_text_series.to_list()
test_text = test_text_series.to_list()
train_label = train_label_series.to_list()
#valid_label = valid_label_series.to_list()
test_label = test_label_series.to_list()

# Sample sizes
print("Size of the training dataset: ", len(train_text))
#print("Size of the validation dataset: ", len(valid_text))
print("Size of the test dataset: ", len(test_text))

Size of the training dataset:  1461
Size of the test dataset:  627


## Tokenizing of the Texts

In [7]:
from transformers import AutoTokenizer
import numpy as np

# Definition of the model that will be fine-tuned
#checkpoint = "bert-base-german-cased"
checkpoint = "deepset/gbert-base"
# Getting the tokenizer for the defined model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Getting the encodings (as tensors for tensorflow) for the texts for training, validation, and testing
train_encodings = dict(tokenizer(train_text, padding=True, truncation=True, return_tensors='np'))
#valid_encodings = dict(tokenizer(valid_text, padding=True, truncation=True, return_tensors='np'))
test_encodings = dict(tokenizer(test_text, padding=True, truncation=True, return_tensors='np'))

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


## Class Weight Calculation

In [None]:
# Calculation of class weights to account for the unbalanced sizes of the classes

unique, counts = numpy.unique(train_label, return_counts=True)
print("Class Frequencies: ", dict(zip(unique, counts)))

class_weight = {0: counts[1]/counts[0], 1: 1.0}
print("Class Weights: ", class_weight)

Class Frequencies:  {0: 100, 1: 1361}
Class Weights:  {0: 13.61, 1: 1.0}


## Fine-Tuning with learning rate optimization

### Definition of the model

In [None]:
# Import of all needed functions and packages
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from utils import F1_metric

# Definition of batch size and number of epochs
batch_size = 8
num_epochs = 3

# Definition of the learning rate scheduler
# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied by the total number of epochs
num_train_steps = (len(train_label) // batch_size) * num_epochs
lr_scheduler = PolynomialDecay(initial_learning_rate=5e-5, end_learning_rate=0., decay_steps=num_train_steps)

# Definition of the optimizer using the learning rate scheduler
opt = Adam(learning_rate=lr_scheduler)

# Definition of the model architecture and initial weights
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Definition of the loss function
loss = SparseCategoricalCrossentropy(from_logits=True)
# Definition of the full model for training (or fine-tuning)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

Downloading:   0%|          | 0.00/533M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training of the model

In [None]:
model.fit(
    train_encodings,
    np.array(train_label),
    #validation_data=(valid_encodings, np.array(valid_label)),
    class_weight=class_weight,
    batch_size=batch_size,
    epochs=num_epochs
)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbe41948b50>

## Saving and Loading the Model

In [None]:
# After fine-tuning you might want to save the model to re-use it later
model.save_pretrained("hf_model_a4s_i2b.tf")

In [None]:
# To load an already fine-tuned model to directly use it
from transformers import TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained("hf_model_a4s_i2b.tf")

Some layers from the model checkpoint at hf_model_a4s_i2b.tf were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at hf_model_a4s_i2b.tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


## Model Evaluation

In [None]:
import tensorflow as tf
# Calculation of the probabilities for each class
# There is no softmax layer at the top of the models in Hugging Face, therefore
# the probabilities have to be calculated here using the softmax function
test_pred_prob = tf.nn.softmax(model.predict(dict(test_encodings))['logits'])

# Extraction of the respective class number with the highest probability
test_pred_class = np.argmax(test_pred_prob, axis=1)



In [None]:
# Checking the test data results

from sklearn import metrics

# Mean accuracy
print("Mean Accuracy:\n", metrics.accuracy_score(test_label, test_pred_class))

# Confusion matrix
print("Confusion Matrix:\n", metrics.confusion_matrix(test_label, test_pred_class))

# F1 Score
print("F1 Score:\n", metrics.f1_score(test_label, test_pred_class))

# Precision
print("Precision:\n", metrics.precision_score(test_label, test_pred_class))

# Recall
print("Recall:\n", metrics.recall_score(test_label, test_pred_class))

# ROC AUC Score
print("ROC AUC:\n", metrics.roc_auc_score(test_label, test_pred_class))

# Cohen's Kappa Score
print("Cohen's Kappa:\n", metrics.cohen_kappa_score(test_label, test_pred_class))

Mean Accuracy:
 0.9409888357256778
Confusion Matrix:
 [[ 31  15]
 [ 22 559]]
F1 Score:
 0.967965367965368
Precision:
 0.9738675958188153
Recall:
 0.9621342512908778
ROC AUC:
 0.8180236473845693
Cohen's Kappa:
 0.5944018042904349
