# Fine-Tuning BERT models for NER

by Benjamin Kissinger & Andreas Sünder

## Install required packages (only once)

Run this in a notebook cell (`%pip` is an IPython magic, not a shell command):

```python
%pip install -r requirements.txt
```

## Setup

Open up a terminal and run the following commands:

```bash
huggingface-cli login
wandb login
```

In [2]:
import os

# Base checkpoint to fine-tune; the tokenizer and model cells load it by this name.
model_id = 'bert-base-multilingual-cased'

# Weights & Biases environment settings, set before any training code runs.
os.environ.update({
  'WANDB_PROJECT': 'bert-base-multilingual-cased',
  'WANDB_DISABLED': 'true',
})

## Load dataset

In [3]:
from datasets import load_dataset
# Load the NER dataset by its Hub id. Later cells read its 'tokens' and
# 'ner_ids' columns and use the 'train' and 'validation' splits.
dataset = load_dataset('textminr/ner_tokenized')

## Process dataset

In [4]:
# Entity classes. A label's position in this list is its integer class id:
# compute_metrics indexes the list with predicted and reference ids.
label_list = ['O', 'AUTHOR', 'DATE']

In [5]:
from transformers import AutoTokenizer
# Tokenizer that belongs to the checkpoint named by `model_id`.
# NOTE(review): `add_prefix_space` is normally a byte-level BPE option
# (GPT-2 / RoBERTa); presumably it has no effect on a BERT tokenizer — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_prefix_space=True)

In [6]:
def tokenize_and_align_labels(row):
  """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

  Args:
    row: a batch mapping with 'tokens' (one list of words per example) and
      'ner_ids' (one list of integer label ids per example, indexed by word).

  Returns:
    The tokenizer output with an added 'labels' entry holding one list per
    example. The first sub-token of every word carries that word's label id;
    special tokens and the remaining sub-tokens of a word carry -100.
  """
  encoded = tokenizer(row['tokens'], truncation=True, is_split_into_words=True)

  aligned = []
  for batch_idx, word_labels in enumerate(row['ner_ids']):
    # word_ids maps every sub-token to the index of the word it came from
    # (None for special tokens).
    word_ids = list(encoded.word_ids(batch_index=batch_idx))
    # Pair each sub-token with the word index of the sub-token before it.
    preceding = [None] + word_ids[:-1]
    aligned.append([
      word_labels[current] if current is not None and current != before else -100
      for current, before in zip(word_ids, preceding)
    ])

  encoded['labels'] = aligned
  return encoded

# Run the alignment over the whole dataset; batched=True passes a batch of
# examples to the function per call instead of a single example.
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

In [7]:
from transformers import DataCollatorForTokenClassification
# Batch collator handed to the Trainer; built from the same tokenizer.
# Presumably pads inputs and labels per batch — see the transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [8]:
import evaluate
# Sequence-labelling metric; compute_metrics calls seqeval.compute(...) on it.
seqeval = evaluate.load('seqeval')

In [9]:
import numpy as np

def compute_metrics(p):
  """Turn raw model outputs into seqeval scores.

  Args:
    p: a (predictions, labels) pair. Predictions are per-token class scores
      (argmax is taken over axis 2); labels are integer class ids, with -100
      marking positions to skip.

  Returns:
    A dict with 'precision', 'recall', 'f1' and 'accuracy', read from the
    'overall_*' entries of the seqeval result.

  NOTE(review): seqeval is designed for IOB-style tags (B-/I- prefixes), while
  label_list holds bare tag names — confirm the scores mean what is intended.
  """
  scores, gold = p
  predicted = np.argmax(scores, axis=2)

  true_predictions = []
  true_labels = []
  for predicted_row, gold_row in zip(predicted, gold):
    # Keep only positions that carry a real label (not -100).
    kept = [
      (pred_id, gold_id)
      for pred_id, gold_id in zip(predicted_row, gold_row)
      if gold_id != -100
    ]
    true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
    true_labels.append([label_list[gold_id] for _, gold_id in kept])

  results = seqeval.compute(predictions=true_predictions, references=true_labels)

  summary = {}
  summary['precision'] = results['overall_precision']
  summary['recall'] = results['overall_recall']
  summary['f1'] = results['overall_f1']
  summary['accuracy'] = results['overall_accuracy']
  return summary

## Train model

In [10]:
# Class id -> tag name; passed to the model as id2label.
# Keep the order identical to label_list, which compute_metrics uses to decode ids.
id2tag = dict(enumerate(('O', 'AUTHOR', 'DATE')))

# Reverse lookup (tag name -> class id); passed to the model as label2id.
tag2id = dict(zip(id2tag.values(), id2tag.keys()))

In [11]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head sized to label_list.
# The id2label / label2id mappings are handed to the model here.
model = AutoModelForTokenClassification.from_pretrained(
  model_id,
  num_labels=len(label_list),
  id2label=id2tag,
  label2id=tag2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments, Trainer
from datetime import datetime  # NOTE(review): not used anywhere in this cell

# Hyperparameters plus the logging / evaluation / upload schedule for the run.
training_args = TrainingArguments(
  output_dir='./output',
  per_device_eval_batch_size=4,
  per_device_train_batch_size=4,
  # fp16=False,
  # bf16=False,
  learning_rate=2e-5,
  num_train_epochs=1,
  logging_strategy='steps',
  logging_steps=5,
  evaluation_strategy='steps',  # NOTE(review): newer transformers releases rename this to eval_strategy — check the pinned version
  eval_steps=250,
  report_to='none',  # no external experiment tracker
  save_strategy='no',  # NOTE(review): combined with push_to_hub=True below — confirm the intended upload behaviour
  hub_model_id='textminr/ner-bert',
  push_to_hub=True,
)
  
# Wire together the model, the tokenized splits, the collator and the metric function.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_datasets['train'],
  eval_dataset=tokenized_datasets['validation'],
  tokenizer=tokenizer,
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()

  0%|          | 0/1591 [00:00<?, ?it/s]

In [None]:
# Upload the trained model to the Hub repository configured in the training arguments.
trainer.push_to_hub()

## Inference / Test

In [12]:
# Rebind model_id from the base checkpoint to a fine-tuned model on the Hub.
# NOTE(review): this repo id differs from hub_model_id ('textminr/ner-bert')
# in the training cell — confirm which repository is meant.
model_id = 'textminr/ner-multilingual-bert'

In [13]:
from transformers import pipeline
# NER inference pipeline for the model named by model_id.
# aggregation_strategy='simple' yields grouped entities ('entity_group' in the
# output below) instead of one entry per sub-token.
classifier = pipeline(
  'ner',
  model=model_id,
  aggregation_strategy='simple'
)

config.json:   0%|          | 0.00/982 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [19]:
# Smoke test on a German sentence containing a year and two person names.
sentence = "1913 veröffentlichte Georg Biermann die erste Monographie über Lovis Corinth"
classifier(sentence)

[{'entity_group': 'DATE',
  'score': 0.9736704,
  'word': '1913',
  'start': 0,
  'end': 4},
 {'entity_group': 'AUTHOR',
  'score': 0.9918079,
  'word': 'Georg Biermann',
  'start': 21,
  'end': 35},
 {'entity_group': 'AUTHOR',
  'score': 0.59317625,
  'word': 'Lo',
  'start': 63,
  'end': 65}]