# Download and Encode Data

In [1]:
# install

!pip install transformers transformers[torch] datasets accelerate -U evaluate



In [2]:
# load datasets

import datasets
train_1k = datasets.load_dataset("polyglot_ner", "de", split="train[:1000]")
train_3k = datasets.load_dataset("polyglot_ner", "de", split="train[1000:4000]")
eval = datasets.load_dataset("polyglot_ner", "de", split="train[4000:6000]")

In [3]:
# load pretrained fast tokenizer for bert-base-german-cased model

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [4]:
# encode the labels
# we fit the labels only on the train_1k data as there are only 4

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# flatten ner list for le
tags = [tag for example in train_1k["ner"] for tag in example]
le.fit(tags)

# construct label and index dictionarys
label2id = {}
id2label = {}
for index, label in enumerate(le.classes_):
  label2id[label] = index
  id2label[index] = label

In [5]:
# function for dealing with labels on word and toknization on sub-word level

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True, padding="max_length", max_length=128)

    labels = []
    for i, label in enumerate(examples["ner"]):
        label = le.transform(label) # encode label
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
# encode data

encoded_train_1k = train_1k.map(tokenize_and_align_labels, batched=True)
encoded_train_3k = train_3k.map(tokenize_and_align_labels, batched=True)
encoded_eval = eval.map(tokenize_and_align_labels, batched=True)

# Train and Evaluate Models

In [7]:
# define evaluation procedure

import evaluate
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np


def compute_metrics(p):
    predictions = p[0]
    labels = p[1]
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [p for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [l for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # sklearn f1 score expects binary matrix
    preds_labels = true_predictions[:]
    preds_labels.extend(true_labels)
    binarizer = MultiLabelBinarizer().fit(preds_labels)
    binarized_predictions = binarizer.transform(true_predictions)
    binarized_labels = binarizer.transform(true_labels)

    f1_macro = f1_score(binarized_labels, binarized_predictions, average='macro')
    f1_micro = f1_score(binarized_labels, binarized_predictions, average='micro')
    return {
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
    }

## 1k Examples

In [8]:
# train with 1k examples

from transformers import BertForTokenClassification, Trainer, TrainingArguments

model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(le.classes_), id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_1k,
    eval_dataset=encoded_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,0.3323,0.131711,0.674563,0.882055
2,0.1073,0.11912,0.739915,0.892598


TrainOutput(global_step=126, training_loss=0.1960027123254443, metrics={'train_runtime': 95.5537, 'train_samples_per_second': 20.931, 'train_steps_per_second': 1.319, 'total_flos': 130650740736000.0, 'train_loss': 0.1960027123254443, 'epoch': 2.0})

In [9]:
# evaluate - or see training results for epoch 2 in most cases
# however, epoch 2 does theoretically not have to be the best model
# so we evaluate the best model loaded at the end of training explicetely

preds = trainer.predict(encoded_eval)
eval = compute_metrics(preds)

print(f"The f1-micro on the evaluation set is {round(eval['f1_micro'] * 100, 2)} % and the f1 macro is {round(eval['f1_macro'] * 100, 2)} %.")

The f1-micro on the evaluation set is 89.26 % and the f1 macro is 73.99 %.


## 3k Examples

In [10]:
# train with 3k examples

model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(le.classes_), id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_3k,
    eval_dataset=encoded_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,0.2172,0.101942,0.779709,0.908303
2,0.0736,0.100063,0.78807,0.910547


TrainOutput(global_step=376, training_loss=0.11939250915608507, metrics={'train_runtime': 180.828, 'train_samples_per_second': 33.181, 'train_steps_per_second': 2.079, 'total_flos': 391952222208000.0, 'train_loss': 0.11939250915608507, 'epoch': 2.0})

In [11]:
# evaluate

preds = trainer.predict(encoded_eval)
eval = compute_metrics(preds)

print(f"The f1-micro on the evaluation set is {round(eval['f1_micro'] * 100, 2)} % and the f1 macro is {round(eval['f1_macro'] * 100, 2)} %.")

The f1-micro on the evaluation set is 91.05 % and the f1 macro is 78.81 %.


## 3k Examples and Frozen Embeddings

In [13]:
# train with 3k examples and frozen embeddings

model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(le.classes_), id2label=id2label, label2id=label2id)

# freeze BERT layers and not classifier
for param in model.bert.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_3k,
    eval_dataset=encoded_eval,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro
1,1.1116,0.975174,0.449507,0.676398
2,0.9006,0.851619,0.426836,0.719332


TrainOutput(global_step=376, training_loss=1.056278644724095, metrics={'train_runtime': 86.5197, 'train_samples_per_second': 69.348, 'train_steps_per_second': 4.346, 'total_flos': 391952222208000.0, 'train_loss': 1.056278644724095, 'epoch': 2.0})

In [14]:
# evaluate

preds = trainer.predict(encoded_eval)
eval = compute_metrics(preds)

print(f"The f1-micro on the evaluation set is {round(eval['f1_micro'] * 100, 2)} % and the f1 macro is {round(eval['f1_macro'] * 100, 2)} %.")

The f1-micro on the evaluation set is 71.93 % and the f1 macro is 42.68 %.
