In [None]:
# Fine-tune a BERT-style model on GermEval 2014; the dataset is available through Hugging Face datasets.
# The Hugging Face data was compared against the officially downloadable dataset.

In [None]:
# install additionally needed packages
!pip install datasets
!pip install transformers
!pip install seqeval

In [None]:
import numpy as np

from datasets import load_dataset, load_metric
import transformers
from transformers import DataCollatorForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer


In [None]:
# Load the GermEval 2014 dataset via Hugging Face datasets.
# Note: the name `datasets` refers to the loaded dataset object from here on; only
# `load_dataset` / `load_metric` were imported from the package of the same name.
datasets = load_dataset("germeval_14")

In [None]:
# Task name; used to build the label column name f"{task}_tags".
task = "ner" 
# Base model checkpoint that gets fine-tuned.
model_checkpoint = "distilbert-base-german-cased"
# Suffix combined with the checkpoint name for the checkpoint and save directories.
model_suffix = "finetuned-germeval14-german"
# Training hyperparameters, passed to TrainingArguments further down.
batch_size = 16
learning_rate = 2e-5
num_epochs = 3

In [None]:
# GermEval has 25 labels; only the 9 CoNLL-03 labels are used here. The reduction
# happens later in `restructure_labels`.
# The column name is derived from `task`, like everywhere else in this notebook.
datasets["train"].features[f"{task}_tags"]

In [None]:
# Original GermEval tag names as stored in the dataset features (displayed for inspection).
label_list = datasets["train"].features[f"{task}_tags"].feature.names
label_list

In [None]:
# Instantiate the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# A fast tokenizer is required: `tokenize_and_align_labels` calls `word_ids()` on the encoding.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
# Sanity check: tokenize a sample sentence and inspect the resulting encoding.
tokenizer("Hallo ich bin Markus aus Köln!")

In [None]:
# `restructure_labels` reduces the 25 GermEval label ids (0-24) to the same 9 used in CoNLL-03.

# Controls how `tokenize_and_align_labels` treats the non-first sub-tokens of a word:
# True -> they receive the word's label, False -> they receive the ignore value -100.
label_all_tokens = True

# Lookup table: GermEval label id -> index into
# ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'].
_GERMEVAL_TO_CONLL = {
    0: 0,                    # -> O
    1: 5, 3: 5, 5: 5,        # -> B-LOC
    2: 6, 4: 6, 6: 6,        # -> I-LOC
    7: 3, 9: 3, 11: 3,       # -> B-ORG
    8: 4, 10: 4, 12: 4,      # -> I-ORG
    13: 7, 15: 7, 17: 7,     # -> B-MISC
    14: 8, 16: 8, 18: 8,     # -> I-MISC
    19: 1, 21: 1, 23: 1,     # -> B-PER
    20: 2, 22: 2, 24: 2,     # -> I-PER
}


def restructure_labels(l):
    """
    Map a GermEval label id onto the 9-label CoNLL-03 scheme:
    ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

    Args:
        l: integer GermEval label id (0-24), or -100 for a position that the
           loss should ignore.

    Returns:
        The index into the list above. -100 is returned unchanged; any other
        unknown id falls back to 0 ('O').
    """
    # -100 marks special tokens / ignored sub-tokens. It must not be turned into
    # 'O', otherwise those positions would be trained and scored as real labels.
    if l == -100:
        return -100
    return _GERMEVAL_TO_CONLL.get(l, 0)


def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and align the word-level tags with
    the resulting sub-tokens.

    Args:
        examples: batch with a "tokens" entry (list of word lists) and a
            f"{task}_tags" entry (list of per-word GermEval label ids).

    Returns:
        The tokenizer output with an added "labels" entry: one list per example,
        holding the CoNLL-03 style label id for each sub-token, or -100 for
        positions the loss should ignore.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have a word id of None; -100 makes the loss skip them.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or any sub-token when label_all_tokens is set.
                # Only real labels are remapped, so the -100 markers stay intact.
                label_ids.append(restructure_labels(label[word_idx]))
            else:
                # Remaining sub-tokens of a word are ignored when label_all_tokens is off.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    # Assigned once after the loop, so an empty batch yields an empty label list.
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Prepare our datasets: apply tokenization and label alignment in batches.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

In [None]:
# Instantiate the model with a 9-way token classification head.
# These are the CoNLL-03 style labels produced by `restructure_labels`; position == class id.
conll_label_names = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(conll_label_names),
    # Store readable label names in the model config so that the saved model (and the
    # inference pipeline built from it) reports e.g. "B-PER" instead of a generic "LABEL_1".
    id2label=dict(enumerate(conll_label_names)),
    label2id={name: idx for idx, name in enumerate(conll_label_names)},
)

In [None]:
# Training arguments; batch size, learning rate and epoch count come from the config cell.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = TrainingArguments(
    f"checkpoints-{model_checkpoint}-{model_suffix}",  # first positional argument: directory name for checkpoints
    evaluation_strategy = "epoch",  # evaluate once per epoch
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
)

In [None]:
# Collator for token classification batches, built from the tokenizer; handed to the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Load the seqeval metric and define our custom label list.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases - confirm
# against the installed version.
metric = load_metric("seqeval")
# The list index is the class id produced by `restructure_labels`.
my_label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


def compute_metrics(p):
    """
    Turn raw model outputs into overall seqeval scores.

    Args:
        p: pair of (predictions, labels). The class id is taken as the argmax
           over axis 2 of the predictions; labels hold gold class ids, with
           -100 marking positions to skip.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore index.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([my_label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([my_label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


In [None]:
# Instantiate the Trainer: model and training arguments, the tokenized train/validation
# splits, the token classification collator and our custom metric function.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # used for the per-epoch evaluation and trainer.evaluate()
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Run the fine-tuning; the return value of train() is displayed as the cell output.
trainer.train()

In [None]:
# Get evaluation results on the eval_dataset given to the Trainer (the validation split).
trainer.evaluate()

In [None]:
# Save the fine-tuned model; the pipeline cell further down loads it from this same path.
trainer.save_model(f"{model_checkpoint}-{model_suffix}")

In [None]:
# Per-entity-group results on the validation split.
predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
# Collapse the scores along axis 2 into predicted class ids.
predictions = np.argmax(predictions, axis=2)

# Drop positions whose gold label is the ignore index (-100), then map ids to names.
true_predictions = [
    [
        my_label_list[predicted_id]
        for predicted_id, gold_id in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        my_label_list[gold_id]
        for gold_id in gold_row
        if gold_id != -100
    ]
    for gold_row in labels
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

# Test our new fine-tuned model

In [None]:
from transformers import pipeline

# Alternative (disabled): load tokenizer and model from an intermediate training checkpoint.
# tokenizer = AutoTokenizer.from_pretrained("test-ner-2/checkpoint-2000", use_fast=True)
# model = AutoModelForTokenClassification.from_pretrained("test-ner-2/checkpoint-2000")

# Build a NER pipeline from the directory written by trainer.save_model(...).
# NOTE(review): `grouped_entities` is deprecated in newer transformers releases in
# favour of `aggregation_strategy` - confirm against the installed version.
nlp = pipeline("ner", f"{model_checkpoint}-{model_suffix}", grouped_entities=True)

In [None]:
# Run the fine-tuned pipeline on a sample sentence and show the recognised entities.
text = "Hallo ich bin Markus Nutz und arbeite bei OBI Köln-Mülheim, ich komme aus Köln"

list(nlp(text))