In [1]:
pip install transformers datasets evaluate seqeval

Note: you may need to restart the kernel to use updated packages.


# Task

Token classification assigns a label to individual tokens in a sentence. One of the most common token classification tasks is Named Entity Recognition (NER). NER attempts to find a label for each entity in a sentence, such as a person, location, or organization.

This notebook shows how to:
1. Finetune DistilBERT on the WNUT 17 dataset to detect new entities.
2. Use the finetuned model for inference.

# Libraries

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# Data

In [3]:
# Load the WNUT 17 dataset from the 🤗 Datasets library
wnut = load_dataset("wnut_17")

In [4]:
# Inspect the first training example: its word-level tokens and their integer NER tags
wnut["train"][0]

{'id': '0',
 'tokens': ['@paulwalk',
  'It',
  "'s",
  'the',
  'view',
  'from',
  'where',
  'I',
  "'m",
  'living',
  'for',
  'two',
  'weeks',
  '.',
  'Empire',
  'State',
  'Building',
  '=',
  'ESB',
  '.',
  'Pretty',
  'bad',
  'storm',
  'here',
  'last',
  'evening',
  '.'],
 'ner_tags': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  8,
  8,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [5]:
# Pull the words and their integer NER tags out of the first training example,
# then print them side by side as (token, tag) pairs.
tokens_list, ner_tags_list = (wnut["train"][0][field] for field in ("tokens", "ner_tags"))
print(tuple(zip(tokens_list, ner_tags_list)))

(('@paulwalk', 0), ('It', 0), ("'s", 0), ('the', 0), ('view', 0), ('from', 0), ('where', 0), ('I', 0), ("'m", 0), ('living', 0), ('for', 0), ('two', 0), ('weeks', 0), ('.', 0), ('Empire', 7), ('State', 8), ('Building', 8), ('=', 0), ('ESB', 7), ('.', 0), ('Pretty', 0), ('bad', 0), ('storm', 0), ('here', 0), ('last', 0), ('evening', 0), ('.', 0))


In [6]:
# Each number in ner_tags is the index of an entity label.
# Convert the numbers to their label names to find out what the entities are.
# NB: The letter that prefixes each label name indicates the token's position within an entity:
# B- marks the beginning of an entity.
# I- marks a token inside the same entity (e.g. 'State' is part of the entity 'Empire State Building').
# O (the letter, stored at index 0) means the token doesn't correspond to any entity.
label_list = wnut["train"].features["ner_tags"].feature.names
label_list

['O',
 'B-corporation',
 'I-corporation',
 'B-creative-work',
 'I-creative-work',
 'B-group',
 'I-group',
 'B-location',
 'I-location',
 'B-person',
 'I-person',
 'B-product',
 'I-product']

# Preprocessing

In [7]:
# Load the tokenizer of the distilbert-base-uncased checkpoint; it is used to preprocess the tokens field
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
# From the example tokens field above, it looks like the input has already been tokenized...
# But it has only been split into words; it hasn't been tokenized into subwords yet.
# We need to set is_split_into_words=True so the tokenizer splits the words into subwords.
example = wnut["train"][0]
example_tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Convert the ids back to token strings to see what the tokenizer produced
example_tokens = tokenizer.convert_ids_to_tokens(example_tokenized_input["input_ids"])
example_tokens

['[CLS]',
 '@',
 'paul',
 '##walk',
 'it',
 "'",
 's',
 'the',
 'view',
 'from',
 'where',
 'i',
 "'",
 'm',
 'living',
 'for',
 'two',
 'weeks',
 '.',
 'empire',
 'state',
 'building',
 '=',
 'es',
 '##b',
 '.',
 'pretty',
 'bad',
 'storm',
 'here',
 'last',
 'evening',
 '.',
 '[SEP]']

In [None]:
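# Note that the tokenization above adds the special tokens [CLS] and [SEP],
# and splits some words into several subword tokens (e.g. 'ESB' -> 'es', '##b').
# Issue: the tokenized input is now longer than the list of labels, so the two no longer line up.
# We need to realign the labels with the tokens.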

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the NER tags with the resulting tokens.

    Args:
        examples: A batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of integer tags per sentence,
            indexed by word position).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry holding
        one label list per sentence. The first token of each word carries that
        word's tag; special tokens and the remaining subword tokens of a word
        are set to -100.
    """
    # -100 causes a position to be ignored by the PyTorch loss function (CrossEntropyLoss).
    ignore_index = -100

    # The words are already split, so tell the tokenizer to only break them into subwords.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        # word_ids maps every token to the index of the word it came from
        # (None for special tokens such as [CLS] and [SEP]).
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a later subword of a word that is already labelled.
                label_ids.append(ignore_index)
            else:
                # First token of a new word: it carries the word's label.
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Apply the preprocessing function over the entire dataset
# Setting batched=True speeds up map by processing multiple elements of the dataset at once
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)

In [None]:
# Create batches of examples using DataCollatorForTokenClassification
# It's more efficient to dynamically pad the sentences to the longest length in a batch during collation
# than to pad the whole dataset to the maximum length up front
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)