https://www.analyticsvidhya.com/blog/2022/06/fine-tune-bert-model-for-named-entity-recognition-in-google-colab/

## Install Required Libraries

In [None]:
# We need to install the necessary libraries to work with the HuggingFace transformer
# datasets library to fetch data
# tokenizers to preprocess the data
# transformers to fine-tune the models
# seqeval to compute model metrics

!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q

## Load English dataset

We will be using an English language NER dataset from the HuggingFace datasets module. It follows the BIO (Beginning, Inside, Outside) format for tagging sentence tokens for the Named Entity Recognition task.

The dataset contains 3 sets of data, train, validation, and test. It consists of tokens, ner_tags, langs, and spans. The ner_tags have ids corresponding to BIO format, I-TYPE, which means the word is inside a phrase of type TYPE. Only if two phrases of the same type immediately follow each other, the first word of the second phrase will have the tag B-TYPE to show that it starts a new phrase. A word with the tag O is not part of a phrase.

There are 4 classes in total: Person (PER), Organization (ORG), Location (LOC), and Other (O).

In [None]:
from datasets import load_dataset

dataset = load_dataset("wikiann", "en")

In [None]:
dataset.keys()

In [None]:
label_names = dataset["train"].features["ner_tags"].feature.names
label_names

In [None]:
type(dataset['train'])

In [None]:
dataset.column_names

In [None]:
dataset.shape

In [None]:
dataset['train']

In [None]:
dataset['train'][:2]

## Data Preprocessing

- BERT expects its input as `input_ids`, `token_type_ids` and `attention_mask`
- The label also requires adjustment due to subword tokenization used by BERT

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

### Let's see why we need to adjust the labels

- We will process the tokens using tokenizer object

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of pre-split sentences without touching the labels.

    `examples["tokens"]` holds the word lists; they are passed to the
    module-level `tokenizer` with truncation enabled and
    `padding="max_length"`. The tokenizer output is returned unchanged.
    """
    words = examples["tokens"]
    encoded = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
    )
    return encoded

In [None]:
tokenized_datasets_ = dataset.map(tokenize_function, batched=True)

In [None]:
tokenized_datasets_['train'][0]['input_ids'][:20]

In [None]:
tokenized_datasets_['train'][0]['ner_tags'][:20]

In [None]:
len(tokenized_datasets_['train'][0]['input_ids']) == len(tokenized_datasets_['train'][0]['ner_tags'])

- We can see that the length of `input_ids` does not match the length of `ner_tags`, which is why we need to adjust the labels according to the tokenized output

<hr/>

- We will use the argument truncation=True (to truncate texts that are bigger than the maximum size allowed by the model) as there is a sequence in data which has length>512

In [None]:
#Get the values for input_ids, attention_mask, adjusted labels
def tokenize_adjust_labels(all_samples_per_split):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-word tokens.

    Args:
        all_samples_per_split: A batch with a "tokens" entry (one list of
            words per sentence) and a "ner_tags" entry (one list of label ids
            per sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry: one
        list of label ids per sentence, one id per produced token.
    """
    tokenized_samples = tokenizer.batch_encode_plus(
        all_samples_per_split["tokens"],
        is_split_into_words=True,
        truncation=True,
    )

    total_adjusted_labels = []
    for k, existing_label_ids in enumerate(all_samples_per_split["ner_tags"]):
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        # Special tokens have a word id of None; they get -100 so they are
        # automatically ignored in the loss function. Every other token takes
        # the label of the word it came from. The label is looked up by the
        # word id itself rather than by a running counter, so a word that
        # yields no tokens at all cannot shift the labels of the words after it.
        total_adjusted_labels.append([
            -100 if word_idx is None else existing_label_ids[word_idx]
            for word_idx in word_ids_list
        ])

    # Add the adjusted labels to the tokenized samples.
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True, remove_columns=['tokens', 'ner_tags', 'langs', 'spans'])

- To understand word ids, consider following example

In [None]:
out = tokenizer("Fine tune NER in google colab!")
out

In [None]:
out.word_ids(0)

Here we can see that word ids 2 and 5 each appear twice, because sub-word tokenization split those words into multiple tokens



- We now have all the fields required for training: 'input_ids', 'attention_mask' and 'labels' (the DistilBERT tokenizer used here does not return 'token_type_ids')

In [None]:
tokenized_dataset

In [None]:
tokenized_dataset['train'][:2]

- As we can see, different samples have different lengths, so we need to
pad the tokens to the same length

- https://huggingface.co/docs/transformers/main/main_classes/data_collator#transformers.DataCollatorForTokenClassification

In [None]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
data_collator

## Fine Tuning

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW

In [None]:
#check if gpu is present
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

- We will use the distilbert-base-uncased model for fine tuning
- We need to specify the number of labels present in the dataset

In [None]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_names))
model.to(device)

- Create a function to generate metrics
- We will use `seqeval` metrics, commonly used for token classification

In [None]:
# !pip install seqeval -q

In [None]:
import numpy as np
from datasets import load_metric
metric = load_metric("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    `p` unpacks into (predictions, labels). The predicted class of each token
    is the argmax over axis 2 of the predictions. Positions whose label is
    -100 (the ignored index used for special tokens) are dropped, and the
    remaining ids are mapped to tag names through `label_names` before being
    passed to the module-level `metric`.

    Returns a dict with the overall "precision", "recall", "f1" and
    "accuracy" reported by the metric.
    """
    logits, gold_ids = p

    # Pick the index with the highest logit for every token.
    predicted_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Skip the ignored index (special tokens).
            if gold_id == -100:
                continue
            kept_predictions.append(label_names[predicted_id])
            kept_labels.append(label_names[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    scores = {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    return scores

In [None]:
# Try the metric out: score one training sentence's gold tags against themselves.
example = dataset["train"][1]
labels = [label_names[i] for i in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

- Fine Tuning using Trainer API

In [None]:
! pip install -U accelerate
! pip install -U transformers

In [None]:
from transformers import TrainingArguments, Trainer

batch_size = 16
# Number of whole batches in the training split (roughly one log per epoch on
# a single device); kept at 1 or more so a tiny dataset cannot produce 0.
logging_steps = max(1, len(tokenized_dataset['train']) // batch_size)
epochs = 2

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Use whichever keyword the installed TrainingArguments dataclass defines so
# this cell runs both before and after the rename.
eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/bert-fine-tune-ner/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    logging_steps=logging_steps,
    # Evaluate on the validation split at the end of every epoch.
    **{eval_strategy_kwarg: "epoch"},
)

In [None]:
# Build the Trainer from the model, the training configuration, the tokenized
# train/validation splits, the padding collator and the seqeval metric function.
# NOTE(review): newer transformers releases appear to deprecate the `tokenizer=`
# keyword in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,  # the DataCollatorForTokenClassification instance
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # called with (predictions, labels)
)


In [None]:
trainer.train_dataset[0]

In [None]:
trainer.eval_dataset[0]

In [None]:
#fine tune using train method
trainer.train()

In [None]:
trainer.evaluate()

To get the precision/recall/f1 computed for each category for test set, we can apply the same function as before on the result of the `predict` method:

In [None]:
# Run the fine-tuned model on the test split; the third returned value is unused here.
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])
# Take the highest-scoring class for every token.
predictions = np.argmax(predictions, axis=2)

# Keep only positions whose gold label is not the ignored index (-100, used
# for special tokens) and translate the remaining ids into tag names.
true_predictions = [
    [
        label_names[predicted_id]
        for (predicted_id, gold_id) in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [
        label_names[gold_id]
        for (predicted_id, gold_id) in zip(predicted_row, gold_row)
        if gold_id != -100
    ]
    for predicted_row, gold_row in zip(predictions, labels)
]

# seqeval reports the per-entity-type scores alongside the overall ones.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

## Observations

- f1 score for LOC and PER is >85% and ORG has <75%
- Overall f1 score is ~83%
- We can improve the accuracy by training the model for more epochs (currently only 2 epochs)