In [None]:
!pip install transformers datasets seqeval evaluate accelerate

In [None]:
import os

# Print the current working directory; relative paths used later in the
# notebook (e.g. "ner_model", "tokenizer") are resolved against it.
print(f'Current working directory: {os.getcwd()}')
# Uncomment the next two lines to run from the parent directory instead:
# os.chdir(os.path.dirname(os.getcwd()))
# print(f'Current working directory: {os.getcwd()}')
# Background on autoreload:
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
# Mode 2: the notebook reloads external Python modules before executing code.
%autoreload 2

Current working directory: /content
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data

In [None]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

conll2003 = datasets.load_dataset("conll2003")





  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
conll2003

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
classes = conll2003["train"].features['ner_tags'].feature.names
print([f'{i}: {f}' for i,f in enumerate(classes)])

['0: O', '1: B-PER', '2: I-PER', '3: B-ORG', '4: I-ORG', '5: B-LOC', '6: I-LOC', '7: B-MISC', '8: I-MISC']


### Word embedding

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [None]:
example_text = conll2003['train'][0]

example_text['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [None]:
example_text['ner_tags']

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [None]:
tokenizer(example_text["tokens"], is_split_into_words=True)

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenized_input = tokenizer(example_text["tokens"], is_split_into_words=True)

tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [None]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

word_ids = tokenized_input.word_ids()

print(word_ids)

tokenized_input

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

With BERT tokenization, we need to handle the two special tokens that are added at the beginning ([CLS]) and end ([SEP]) of a sentence, and make sure the resulting token sequence stays aligned with the NER labels.

Additionally, the tokenizer uses sub-word tokenization, so we need a strategy for words in the dataset that are split into several sub-word tokens. Here we handle this by giving every sub-word of a word the same label as the word itself.

To make PyTorch ignore certain tokens in the loss computation, we can set their label to -100 which corresponds to the default *ignore_index* value.

We can use the following function (inspiration: https://github.com/rohan-paul/MachineLearning-DeepLearning-Code-for-my-YouTube-Channel/blob/master/NLP/YT_Fine_tuning_BERT_NER_v1.ipynb) to process the *tokenized_inputs* and add a *labels* key to the dictionary which solves the alignment issue.

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """
    Tokenize a batch of pre-split sentences and build a token-level label list for each one.

    The tokenizer adds special tokens and may split a word into several sub-word tokens,
    so the word-level NER tags have to be re-aligned with the resulting token sequence.

    Parameters:
    examples (dict): A batch with
                     - "tokens": one list of words per sentence.
                     - "ner_tags": one list of tag ids per sentence, parallel to "tokens".

    label_all_tokens (bool): If True, every sub-word token receives the tag of its word.
                             If False, only the first sub-word token of a word keeps the tag
                             and the remaining ones are labelled -100.

    Returns:
    The tokenizer output for the batch, extended with a "labels" entry that holds
    one aligned label list per sentence.
    """
    ignored = -100  # label used for positions that must not contribute to the loss

    encoding = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def _align(word_tags, token_word_ids):
        # token_word_ids maps each token to the index of the word it came from;
        # special tokens (e.g. [CLS] and [SEP]) are mapped to None.
        aligned = []
        last_seen = None
        for current in token_word_ids:
            if current is None:
                # special token: never scored
                tag = ignored
            elif current == last_seen and not label_all_tokens:
                # continuation sub-word of the previous token's word: masked out
                tag = ignored
            else:
                # first token of a word, or a continuation that inherits the word's tag
                tag = word_tags[current]
            aligned.append(tag)
            last_seen = current
        return aligned

    encoding["labels"] = [
        _align(word_tags, encoding.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

In [None]:
tokenize_and_align_labels(conll2003['train'][0:1])

{'input_ids': [[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]]}

### Tokenize the entire dataset.

In [None]:
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]



In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### Load the model

In [None]:
from transformers import AutoModelForTokenClassification

# Register the tag names in the model config so that saved checkpoints and
# inference pipelines report e.g. "B-PER" rather than a generic "LABEL_1".
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(classes),
    id2label=dict(enumerate(classes)),
    label2id={label: i for i, label in enumerate(classes)},
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
model.state_dict

<bound method Module.state_dict of BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

## Fine-tuning

### Evaluation scheme using seqeval

To integrate the metrics from seqeval into training, we can create a function that converts the model outputs into predicted tags and drops every position whose label is -100 (the special tokens, plus sub-word continuations when `label_all_tokens=False`).

In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
example_text

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
classes

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
labels = [classes[i] for i in example_text['ner_tags']]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
metric.compute(predictions=[labels], references=[labels])

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
def compute_metrics(eval_preds):
    """
    Compute the overall seqeval scores for a Named Entity Recognition (NER) evaluation run.

    Parameters:
    eval_preds (tuple): A pair (logits, label_ids). The logits are arg-maxed over
                        axis 2 to obtain one predicted class id per token; label_ids
                        holds the reference class id per token, with -100 marking
                        positions that must be ignored.

    Returns:
    A dictionary with the overall "precision", "recall", "f1" and "accuracy".
    """
    logits, label_ids = eval_preds

    # The arg-max of the logits equals the arg-max of the probabilities,
    # so there is no need to apply a softmax first.
    predicted_ids = np.argmax(logits, axis=2)

    predictions, true_labels = [], []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        # Keep only the positions that carry a real label (drop the -100 ones).
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predictions.append([classes[p] for p, _ in kept])
        true_labels.append([classes[l] for _, l in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

### Training

We fine-tune the model for 3 epochs.

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters of the fine-tuning run; "test-ner" is the output directory.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` - switch the keyword if TrainingArguments rejects it.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",  # run the evaluation once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Collator that batches the tokenized examples together with their labels.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Train on the "train" split and report compute_metrics on the "validation" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.016,0.068162,0.942761,0.939702,0.941229,0.985639
2,0.0206,0.060984,0.939307,0.948764,0.944012,0.986576
3,0.0081,0.064329,0.943488,0.950666,0.947063,0.987164


TrainOutput(global_step=2634, training_loss=0.014725454440149468, metrics={'train_runtime': 485.9646, 'train_samples_per_second': 86.679, 'train_steps_per_second': 5.42, 'total_flos': 1021316467278600.0, 'train_loss': 0.014725454440149468, 'epoch': 3.0})

Save the model

In [None]:
model.save_pretrained("ner_model")
tokenizer.save_pretrained("tokenizer")



('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

We also need to save the label mapping (`id2label` / `label2id`) in the model config.

In [None]:
import json

# Class id -> tag name. JSON object keys are always strings, so the ids are
# stringified up front.
id2label = {str(i): label for i, label in enumerate(classes)}
# Tag name -> class id. The ids are kept as integers so that the reverse
# lookup yields usable class indices rather than strings.
label2id = {label: i for i, label in enumerate(classes)}

config_path = "ner_model/config.json"

# Context managers close the file handles deterministically, so the rewritten
# config is fully flushed to disk before anything reads it back.
with open(config_path) as config_file:
    config = json.load(config_file)

config["id2label"] = id2label
config["label2id"] = label2id

with open(config_path, "w") as config_file:
    json.dump(config, config_file)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !cp -r /content/ner_model '/content/drive/MyDrive/Colab Notebooks/NER'
# !cp -r /content/tokenizer '/content/drive/MyDrive/Colab Notebooks/NER'

Reload the model

In [None]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/NER')

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("tokenizer")
model = AutoModelForTokenClassification.from_pretrained("ner_model")

We can make a NER dedicated pipeline out of the finetuned model.

In [None]:
from transformers import pipeline

ner = pipeline("ner", model=model, tokenizer=tokenizer)

example = "Melissa works at the United Nations in Geneva"

ner_results = ner(example)

print(ner_results)



[{'entity': 'B-PER', 'score': 0.99661416, 'index': 1, 'word': 'melissa', 'start': 0, 'end': 7}, {'entity': 'B-ORG', 'score': 0.9990151, 'index': 5, 'word': 'united', 'start': 21, 'end': 27}, {'entity': 'I-ORG', 'score': 0.9986677, 'index': 6, 'word': 'nations', 'start': 28, 'end': 35}, {'entity': 'B-LOC', 'score': 0.99918383, 'index': 8, 'word': 'geneva', 'start': 39, 'end': 45}]


### Evaluate model on test set

We update the above compute_metrics function to return all metrics for each class.

In [None]:
def _compute_metrics(eval_preds):
    """
    Function to compute the evaluation metrics for Named Entity Recognition (NER) tasks.
    The function computes precision, recall, F1 score and accuracy.

    Parameters:
    eval_preds (tuple): A tuple containing the predicted logits and the true labels.

    Returns:
    A dictionary containing the precision, recall, F1 score and accuracy.
    """
    pred_logits, labels = eval_preds

    pred_logits = np.argmax(pred_logits, axis=2)
    # the logits and the probabilities are in the same order,
    # so we don’t need to apply the softmax

    # We remove all the values where the label is -100
    predictions = [
        [classes[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]

    true_labels = [
        [classes[l] for (eval_preds, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
   ]
    results = metric.compute(predictions=predictions, references=true_labels)
    return results

In [None]:
trainer.compute_metrics = _compute_metrics

In [None]:
test_results = trainer.predict(tokenized_datasets['test'])

In [None]:
# The metrics dictionary is the last field of the prediction output;
# access it by name rather than by position.
res = test_results.metrics
res

{'test_loss': 0.15396511554718018,
 'test_LOC': {'precision': 0.9095084979329352,
  'recall': 0.9322033898305084,
  'f1': 0.9207161125319693,
  'number': 2124},
 'test_MISC': {'precision': 0.7587939698492462,
  'recall': 0.7580321285140562,
  'f1': 0.7584128578603716,
  'number': 996},
 'test_ORG': {'precision': 0.8735371838429596,
  'recall': 0.8941267387944358,
  'f1': 0.8837120488829482,
  'number': 2588},
 'test_PER': {'precision': 0.9751786385859346,
  'recall': 0.9540103016924208,
  'f1': 0.9644783336432955,
  'number': 2718},
 'test_overall_precision': 0.9011792452830188,
 'test_overall_recall': 0.9069546641348208,
 'test_overall_f1': 0.904057730983083,
 'test_overall_accuracy': 0.9766763345072854,
 'test_runtime': 9.4122,
 'test_samples_per_second': 366.866,
 'test_steps_per_second': 22.949}

### Observation

We can already observe here a substantial improvement in the scores for each class.