In [1]:
#!pip install datasets

In [77]:
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments
from transformers import pipeline
from transformers import Trainer
from datasets import load_dataset
from huggingface_hub import notebook_login
import evaluate
import numpy as np



In [4]:
dataset = load_dataset("conll2003")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [6]:
dataset["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [7]:
dataset["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

Those are the labels as integers ready for training, but they’re not necessarily useful when we want to inspect the data. Like for text classification, we can access the correspondence between those integers and the label names by looking at the features attribute of our dataset:

In [8]:
ner_feature = dataset["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

So this column contains elements that are sequences of ClassLabels. The type of the elements of the sequence is in the feature attribute of this ner_feature, and we can access the list of names by looking at the names attribute of that feature:

In [9]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

O means the word doesn’t correspond to any entity.

B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.

B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.

B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.

B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity (e.g. a nationality, event, product or work of art).

Now decoding the labels we saw earlier gives this:

In [10]:
words = dataset["train"][0]["tokens"]
labels = dataset["train"][0]["ner_tags"]

# Pad each word and its tag name to a shared width so that the two printed
# rows line up column by column; every column is followed by one space.
word_cells = []
tag_cells = []
for token, tag_id in zip(words, labels):
    tag_name = label_names[tag_id]
    width = max(len(token), len(tag_name))
    word_cells.append(token.ljust(width) + " ")
    tag_cells.append(tag_name.ljust(width) + " ")

line1 = "".join(word_cells)
line2 = "".join(tag_cells)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


_Texts need to be converted to token IDs before the model can make sense of them._

_Since the inputs here are already split into words, I just need to tell the tokenizer so with a special flag (`is_split_into_words=True`)._

In [11]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [12]:
tokenizer.is_fast

True

In [13]:
#To tokenize a pre-tokenized input, we can use our tokenizer as usual and just add is_split_into_words=True:

inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens() #The word lamb, however, was tokenized into two subwords, la and ##mb.

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [14]:
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

*With a tiny bit of work, I can then expand our label list to match the tokens. The first rule I’ll apply is that special tokens get a label of -100. This is because by default -100 is an index that is ignored in the loss function we will use (cross entropy). Then, each token gets the same label as the token that started the word it’s inside, since they are part of the same entity. For tokens inside a word but not at the beginning, I replace the B- with I- (since the token does not begin the entity):*

In [45]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels so there is exactly one label per token.

    Args:
        labels: Integer label ids, one per word.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        A list as long as ``word_ids``. Special tokens get ``-100``; the
        first token of a word gets that word's label; later tokens of the
        same word get the word's label, bumped by one when it is odd.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        continues_word = word_id == previous_word
        previous_word = word_id

        if word_id is None:
            # Special token: -100 so it is ignored downstream.
            aligned.append(-100)
            continue

        tag = labels[word_id]
        # Odd ids are the B-XXX tags and id + 1 is the matching I-XXX tag,
        # so a continuation token of a B-XXX word becomes I-XXX.
        if continues_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned

In [16]:
labels = dataset["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
#print(dataset["train"][0]["tokens"])
labels
#print(inputs.tokens())


[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [17]:
align_labels_with_tokens(labels, word_ids)

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

*As I can see, my function added the -100 for the two special tokens at the beginning and the end, and a new 0 for our word that was split into two tokens.*

Below is the function that processes a batch of examples; I will apply it with the Dataset.map() method using the option batched=True. The only thing that is different from my previous example is that the word_ids() function needs the index of the example whose word IDs I want when the inputs to the tokenizer are lists of texts (or, in my case, lists of lists of words), so I pass that too:

In [46]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            example) and a ``"ner_tags"`` entry (one list of integer labels
            per example).

    Returns:
        The tokenizer output for the batch, with an extra ``"labels"`` entry
        holding the result of ``align_labels_with_tokens`` for each example.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids() needs the example index because the tokenizer was given a
    # whole batch of word lists.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(example_idx))
        for example_idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

I can now apply all that preprocessing in one go on the other splits of our dataset:

In [19]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

## Fine-tuning the model with the Trainer API

### Data collation

Here my labels should be padded the exact same way as the inputs so that they stay the same size, using -100 as a value so that the corresponding predictions are ignored in the loss computation.

This is all done by a DataCollatorForTokenClassification. Like the DataCollatorWithPadding, it takes the tokenizer used to preprocess the inputs:

In [20]:
#!pip install --upgrade numpy


In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)




To test this on a few samples, I can just call it on a list of examples from our tokenized training set:

In [22]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

Let’s compare this to the labels for the first and second elements in our dataset:

In [23]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


As I can see, the second set of labels has been padded to the length of the first one using -100s.

### Metrics

To have the Trainer compute a metric every epoch, I will need to define a compute_metrics() function that takes the arrays of predictions and labels, and returns a dictionary with the metric names and values.

I first need to install the seqeval library:

In [24]:
#!pip install seqeval

In [25]:
#!pip install evaluate


In [21]:
#!pip install --upgrade tensorflow


In [22]:
#pip install --upgrade transformers evaluate tensorflow


In [24]:
## pip install --upgrade tensorflow tensorflow-probability transformers evaluate


Collecting tensorflow-probability
  Downloading tensorflow_probability-0.23.0-py2.py3-none-any.whl (6.9 MB)
     ---------------------------------------- 6.9/6.9 MB 995.3 kB/s eta 0:00:00
Installing collected packages: tensorflow-probability
  Attempting uninstall: tensorflow-probability
    Found existing installation: tensorflow-probability 0.20.1
    Uninstalling tensorflow-probability-0.20.1:
      Successfully uninstalled tensorflow-probability-0.20.1
Successfully installed tensorflow-probability-0.23.0
Note: you may need to restart the kernel to use updated packages.




In [26]:
metric = evaluate.load("seqeval")





This metric does not behave like the standard accuracy: it will actually take the lists of labels as strings, not integers, so I will need to fully decode the predictions and labels before passing them to the metric. Let’s see how it works. First, I’ll get the labels for my first training example:

In [27]:
labels = dataset["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

We can then create fake predictions for those by just changing the value at index 2:

In [28]:
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])
#Note that the metric takes a list of predictions (not just one) and a list of labels. Here’s the output:

{'MISC': {'precision': 1.0,
  'recall': 0.5,
  'f1': 0.6666666666666666,
  'number': 2},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.8,
 'overall_accuracy': 0.8888888888888888}

This is sending back a lot of information! I get the precision, recall, and F1 score for each separate entity, as well as overall. For our metric computation I will only keep the overall score.

This compute_metrics() function first takes the argmax of the logits to convert them to predictions (as usual, the logits and the probabilities are in the same order, so I don’t need to apply the softmax). Then I have to convert both labels and predictions from integers to strings. I remove all the values where the label is -100, then pass the results to the metric.compute() method:

In [47]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into the overall seqeval scores.

    Args:
        eval_preds: A pair ``(logits, labels)``. The predicted class is the
            argmax over the last axis of ``logits``; ``labels`` holds integer
            label ids, with ``-100`` marking positions to ignore.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, read from the ``overall_*`` entries returned by
        ``metric.compute``.
    """
    logits, labels = eval_preds
    # The argmax of the logits picks the same class as the argmax of the
    # probabilities, so no softmax is applied.
    predictions = np.argmax(logits, axis=-1)

    # The metric works on label strings: decode the ids and drop every
    # position whose reference label is -100.
    reference_tags = []
    for label_row in labels:
        reference_tags.append(
            [label_names[gold] for gold in label_row if gold != -100]
        )

    predicted_tags = []
    for predicted_row, label_row in zip(predictions, labels):
        kept = []
        for predicted, gold in zip(predicted_row, label_row):
            if gold != -100:
                kept.append(label_names[predicted])
        predicted_tags.append(kept)

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

Now that this is done, I am almost ready to define my Trainer. I just need a model to fine-tune!

### Defining the model

Since I am working on a token classification problem, I will use the AutoModelForTokenClassification class. The main thing to remember when defining this model is to pass along some information on the number of labels I have. The easiest way to do this is to pass that number with the num_labels argument, but if I want a nice inference widget, it’s better to set the correct label correspondences instead.

They should be set by two dictionaries, id2label and label2id, which contain the mappings from ID to label and vice versa:

In [30]:
# Two-way mapping between integer label ids and label names, in the order
# given by label_names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [31]:
label2id

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'B-LOC': 5,
 'I-LOC': 6,
 'B-MISC': 7,
 'I-MISC': 8}

In [32]:
id2label

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'B-LOC',
 6: 'I-LOC',
 7: 'B-MISC',
 8: 'I-MISC'}

Now I can just pass them to the AutoModelForTokenClassification.from_pretrained() method, and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:

In [33]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Let’s double-check that our model has the right number of labels:

In [34]:
model.config.num_labels

9

### Fine-tuning the model 

I am now ready to train our model! I just need to do two last things before we define our Trainer: log in to Hugging Face and define our training arguments. 

In [39]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

I set evaluation_strategy="epoch" and save_strategy="epoch" so that the model is evaluated and saved at the end of every epoch, and I specify push_to_hub=True to indicate that I want to upload the results to the Model Hub.

In [35]:
# Training configuration for the fine-tuning run.
args = TrainingArguments(
    # Presumably the output directory / Hub repository name: the URL returned
    # by trainer.push_to_hub() in this notebook ends in "bert-finetuned-ner".
    "bert-finetuned-ner",
    # Evaluate once per epoch (the training log shows one validation row per epoch).
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    # Save the model once per epoch.
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    # Upload to the Hugging Face Hub; this relies on the notebook_login() call.
    push_to_hub=True,
)

Finally, I just pass everything to the Trainer and launch the training:

In [41]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer, then launch fine-tuning.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    # The validation split supplies the per-epoch validation loss and metrics.
    eval_dataset=tokenized_datasets["validation"],
    # Pads inputs and labels to a common length per batch (labels with -100).
    data_collator=data_collator,
    # Returns precision / recall / f1 / accuracy from the seqeval metric.
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Long-running: the recorded run reports a train_runtime of about 15,700 seconds.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0781,0.080577,0.915631,0.929653,0.922589,0.979602
2,0.0361,0.060043,0.926708,0.9448,0.935667,0.985342
3,0.0189,0.060809,0.930574,0.94968,0.94003,0.986166


TrainOutput(global_step=5268, training_loss=0.06731020526263237, metrics={'train_runtime': 15682.6122, 'train_samples_per_second': 2.686, 'train_steps_per_second': 0.336, 'total_flos': 923635312577460.0, 'train_loss': 0.06731020526263237, 'epoch': 3.0})

Note that while the training happens, each time the model is saved (here, every epoch) it is uploaded to the Hub in the background. 

Once the training is complete, I use the push_to_hub() method to make sure we upload the most recent version of the model:

In [42]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/muhammadahmad2622/bert-finetuned-ner/tree/main/'

### Using the fine-tuned model

To use the model locally in a pipeline, I have to specify the proper model identifier:

In [36]:
model_checkpoint='muhammadahmad2622/bert-finetuned-ner'
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("My name is Ahmed and I work at Unice in Nice.")

[{'entity_group': 'PER',
  'score': 0.9989286,
  'word': 'Ahmed',
  'start': 11,
  'end': 16},
 {'entity_group': 'ORG',
  'score': 0.9950421,
  'word': 'Unice',
  'start': 31,
  'end': 36},
 {'entity_group': 'LOC',
  'score': 0.9986689,
  'word': 'Nice',
  'start': 40,
  'end': 44}]

In [37]:
token_classifier("My name is Muhammad Ahmed and I work at Unice in Nice Sophia.")

[{'entity_group': 'PER',
  'score': 0.9992379,
  'word': 'Muhammad Ahmed',
  'start': 11,
  'end': 25},
 {'entity_group': 'ORG',
  'score': 0.9923763,
  'word': 'Unice',
  'start': 40,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.90186274,
  'word': 'Nice Sophia',
  'start': 49,
  'end': 60}]