In [1]:
!pip install transformers huggingface torch datasets accelerate bitsandbytes accelerate seqeval > /dev/null

The major token classification tasks are:

NER (Named-entity recognition): Classify the entities in the text (person, organization, location...).

POS (Part-of-speech tagging): Grammatically classify the tokens (noun, verb, adjective...).

Chunk (Chunking): Grammatically classify the tokens and group them into "chunks" that go together.

In [2]:
# Notebook-wide settings, gathered here so they are easy to find and tune.

# NOTE: the training-arguments cell further down re-assigns batch_size to 8
# before it is ever passed to the Trainer.
batch_size = 16

# Checkpoint name handed to the tokenizer and model `from_pretrained` calls.
model_cp = "distilbert-base-uncased"

# Selects which "<task>_tags" column of the dataset supplies the labels.
task = "ner"

In [3]:
from datasets import load_dataset, load_metric
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [4]:
# Load the CoNLL-2003 dataset; the result is a DatasetDict with
# train / validation / test splits.
conll = load_dataset("conll2003")
# Bare last expression: show each split's columns and row count.
conll

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
# Peek at the first training example: the word list plus integer-encoded
# POS, chunk and NER tags (one id per word).
conll['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [None]:
# POS tag vocabulary: an id stored in `pos_tags` is an index into these names.
conll['train'].features['pos_tags']

Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None)

In [None]:
# Chunk tag vocabulary (B-/I- prefixed phrase types plus 'O').
conll['train'].features['chunk_tags']

Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None)

In [None]:
# NER tag vocabulary: 'O' plus B-/I- tags for PER, ORG, LOC and MISC.
conll['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [5]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    """Display a random sample of rows with integer class ids decoded to names.

    Args:
        dataset: an object supporting ``len()``, indexing with a list of row
            indices, and a ``.features`` mapping of column name -> feature
            type (as the ``datasets.Dataset`` splits in this notebook do).
        num_examples: number of distinct rows to show; must not exceed
            ``len(dataset)``.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset)

    # Sample without replacement so the same row is never shown twice.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # `names` is a list, so it is indexed (not called) with the id.
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            # Sequence columns hold one id per word; decode each of them.
            df[column] = df[column].transform(lambda x: [typ.feature.names[i] for i in x])

    display(HTML(df.to_html()))

In [None]:
# Spot-check a few random training rows with their tag ids decoded to names.
show_random_elements(conll['train'])

Index(['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'], dtype='object')


Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,8765,"[But, the, spokesman, said, subsequent, tests, in, Cologne, proved, his, body, produced, higher-than-average, testosterone, levels, naturally, .]","[CC, DT, NN, VBD, JJ, NNS, IN, NNP, VBD, PRP$, NN, VBD, JJ, NN, NNS, RB, .]","[O, B-NP, I-NP, B-VP, B-NP, I-NP, B-PP, B-NP, B-VP, B-NP, I-NP, B-VP, B-NP, I-NP, I-NP, B-ADVP, O]","[O, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O, O]"
1,12360,"[6., Federico, Colonna, (, Italy, ), Mapei, 27]","[CD, NNP, NNP, (, NNP, ), NNP, CD]","[B-NP, B-NP, I-NP, O, B-NP, O, B-NP, I-NP]","[O, B-PER, I-PER, O, B-LOC, O, B-ORG, O]"
2,3700,"[Hereford, 1, Doncaster, 0]","[VBN, CD, NNP, CD]","[B-VP, B-NP, I-NP, I-NP]","[B-ORG, O, B-ORG, O]"
3,4467,"[One, feature, of, the, Java, language, is, that, small, software, programmes, ,, known, as, "", applets, "", because, they, are, small, applications, ,, can, be, downloaded, from, the, server, computers, at, the, centre, of, networks, onto, individual, computers, for, use, .]","[CD, NN, IN, DT, NNP, NN, VBZ, DT, JJ, NN, NNS, ,, VBN, IN, "", NNS, "", IN, PRP, VBP, JJ, NNS, ,, MD, VB, VBN, IN, DT, NN, NNS, IN, DT, NN, IN, NNS, IN, JJ, NNS, IN, NN, .]","[B-NP, I-NP, B-PP, B-NP, I-NP, I-NP, B-VP, B-NP, I-NP, I-NP, I-NP, O, B-VP, B-PP, O, B-NP, O, B-SBAR, B-NP, B-VP, B-NP, I-NP, O, B-VP, I-VP, I-VP, B-PP, B-NP, I-NP, I-NP, B-PP, B-NP, I-NP, B-PP, B-NP, B-PP, B-NP, I-NP, B-PP, B-NP, O]","[O, O, O, O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,6564,"[Sunday, .]","[NNP, .]","[B-NP, O]","[O, O]"


In [6]:
# Load the tokenizer belonging to the `model_cp` checkpoint.
# NOTE(review): the alignment code later calls `.word_ids()` on the tokenizer
# output; presumably this needs a "fast" tokenizer -- confirm for other checkpoints.
tokenizer = AutoTokenizer.from_pretrained(model_cp)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# The dataset's examples are already split into words, so the tokenizer is
# given a list of words with is_split_into_words=True instead of a raw string.

tokenizer(["Hello", "this", "is", "a", "single", "sentence", "split", "into", "words"],
         is_split_into_words=True, return_tensors='pt')

{'input_ids': tensor([[ 101, 7592, 2023, 2003, 1037, 2309, 6251, 3975, 2046, 2616,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
# Pick one training example (row 10) to walk through tokenization and
# label alignment step by step.
take_example = conll['train'][10]
# Show its fields as (name, value) pairs.
take_example.items()

dict_items([('id', '10'), ('tokens', ['Spanish', 'Farm', 'Minister', 'Loyola', 'de', 'Palacio', 'had', 'earlier', 'accused', 'Fischler', 'at', 'an', 'EU', 'farm', 'ministers', "'", 'meeting', 'of', 'causing', 'unjustified', 'alarm', 'through', '"', 'dangerous', 'generalisation', '.', '"']), ('pos_tags', [22, 22, 22, 22, 22, 22, 38, 31, 40, 22, 15, 12, 16, 21, 24, 27, 21, 15, 39, 16, 21, 15, 0, 16, 21, 7, 0]), ('chunk_tags', [11, 12, 12, 12, 12, 12, 21, 22, 22, 11, 13, 11, 12, 12, 12, 11, 12, 13, 21, 1, 11, 13, 0, 11, 12, 0, 0]), ('ner_tags', [7, 0, 0, 1, 2, 2, 0, 0, 0, 1, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])])

In [11]:
# Tokenize the example's pre-split word list.
tokenized_input = tokenizer(take_example['tokens'], is_split_into_words=True)

# Convert the produced ids back to token strings so they can be inspected.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])

In [13]:
# NOTE: this rebinds the name `print` to rich's implementation for every cell
# executed after this one (the builtin is shadowed at notebook scope).
from rich import print

# Show the token strings produced for the sample example.
print(tokens)

In [16]:
# word_ids() gives, for every token, the index of the word it came from.
# The alignment function defined below uses these indices to look up each
# token's tag, and treats None entries as special tokens.
print(tokenized_input.word_ids())

In [17]:
# When True, every token of a word receives that word's label; when False,
# only the first token of each word is labelled and the rest get -100.
label_all_tokens = True

# Question to consider: why do the labels need aligning at all?
# (The tags are given per word, but the model is trained per token.)
# Once that is understood, writing the same function for a new dataset
# becomes much easier.

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and build token-level labels.

    Args:
        examples: a batch mapping column names to lists; reads ``"tokens"``
            (one word list per sentence) and ``f"{task}_tags"`` (one list of
            tag ids per sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding, per sentence, one label id per token. Tokens whose word id
        is None get -100; continuation tokens of a word get the word's label
        or -100 depending on the module-level ``label_all_tokens`` flag.
    """
    encoded = tokenizer(examples["tokens"],
                        truncation=True,
                        is_split_into_words=True)

    def _labels_for(word_ids, word_labels):
        # Project one sentence's word-level labels onto its tokens.
        aligned = []
        prev_word = None
        for word in word_ids:
            if word is None:
                # No source word (special token): ignored by the loss.
                aligned.append(-100)
            elif word != prev_word or label_all_tokens:
                # First token of a word, or a continuation token that
                # should inherit the word's label.
                aligned.append(word_labels[word])
            else:
                # Continuation token that is excluded from the loss.
                aligned.append(-100)
            prev_word = word
        return aligned

    # The batch index is needed to fetch the word ids of the matching sentence.
    encoded["labels"] = [
        _labels_for(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples[f"{task}_tags"])
    ]

    return encoded

In [18]:
# Try the function on the first five training rows.
# Exercise: why is a slice ([:5]) passed rather than a single row?
# Hint: the function enumerates examples[...] as a batch of sentences.

aligned_datapoints = tokenize_and_align_labels(conll['train'][:5])

In [19]:
# Apply the alignment function to every split of the dataset.
# batched=True makes map() hand the function a batch of examples per call,
# which answers the questions raised in the cells above.
conll_aligned = conll.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [22]:
# Label names for the selected task, indexed by class id. The column is
# derived from `task` (as in tokenize_and_align_labels) so the names always
# match the ids used as training labels.
label_list = conll['train'].features[f"{task}_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [23]:
# num_labels must equal the number of distinct labels we want to classify,
# so it is taken from label_list (defined above).
# The classification head is newly initialized (see the warning in the
# output) and is trained by the fine-tuning run below.

model = AutoModelForTokenClassification.from_pretrained(model_cp,
                                                        num_labels=len(label_list))



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
from transformers import Trainer, TrainingArguments

# NOTE: this overrides the batch_size of 16 set in the configuration cell;
# 8 is the value actually used for the per-device batch sizes below.
batch_size = 8

# Training configuration. The first positional argument names the output
# directory after the checkpoint.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    f"{model_cp}-ner",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
    skip_memory_metrics=True
)

Get a data collator that will batch our processed examples together while applying padding to make them all the same size (each batch will be padded to the length of its longest example).

In [28]:
from transformers import DataCollatorForTokenClassification

# Collator that batches the tokenized examples and pads them to a common
# length; it is passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Load the seqeval metric used to score the predicted tag sequences.
# NOTE(review): `datasets.load_metric` is believed to be deprecated in favour
# of the separate `evaluate` library -- confirm for the installed datasets version.
metric = load_metric("seqeval")
metric

In [30]:
# Toy check of the metric on two sentences. In the first one the predicted
# MISC span starts one token earlier than the reference span; the second
# sentence is predicted exactly.
predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
results = metric.compute(predictions=predictions, references=references)

In [31]:
# Per-entity-type scores plus overall precision / recall / F1 / accuracy.
results

{'MISC': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.5,
 'overall_recall': 0.5,
 'overall_f1': 0.5,
 'overall_accuracy': 0.8}

In [32]:
import numpy as np

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is reduced with argmax over axis 2; ``labels`` holds
            the reference label ids, with -100 marking positions to skip.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the module-level ``metric``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    decoded_preds = []
    decoded_golds = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions whose reference label is not the ignore index.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        decoded_preds.append([label_list[pred] for pred, _ in kept])
        decoded_golds.append([label_list[gold] for _, gold in kept])

    scored = metric.compute(predictions=decoded_preds, references=decoded_golds)

    summary = {}
    summary["precision"] = scored["overall_precision"]
    summary["recall"] = scored["overall_recall"]
    summary["f1"] = scored["overall_f1"]
    summary["accuracy"] = scored["overall_accuracy"]
    return summary

In [33]:
# Wire everything together: the model, the training arguments, the aligned
# train/validation splits, the padding collator and the metric function.
trainer = Trainer(
    model,
    args,
    train_dataset=conll_aligned['train'],
    eval_dataset=conll_aligned['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model with the configuration defined in `args`.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Score the fine-tuned model on the validation split, this time keeping the
# per-entity-type breakdown rather than only the overall numbers.
# `conll_aligned` is the tokenized, label-aligned dataset built with conll.map().
predictions, labels, _ = trainer.predict(conll_aligned["validation"])
# Pick the highest-scoring class for every token.
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results