# Install Transformers and Datasets from Hugging Face

In [1]:
# Transformers installation
! pip install transformers[torch] datasets
# ! pip install git+https://github.com/huggingface/transformers.git

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Us

# NER as Token classification

Token classification involves assigning labels to individual tokens in a sentence. A common task in this area is Named Entity Recognition (NER), which identifies and categorizes entities such as people, locations, or organizations within a sentence. The BERT model, specifically the `bert-base-NER`, has been fine-tuned for this purpose. It recognizes four types of entities: Location (LOC), Organization (ORG), Person (PER), and Miscellaneous (MISC), using the English version of the CoNLL-2003 dataset as its training data.

Recognition dataset: https://www.aclweb.org/anthology/W03-0419.pdf



# Load the Model and Tokenizer from bert-base-NER

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Create a Pipeline from the bert-base-NER Model and Tokenizer

In [3]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


# Prepare a Text

In [4]:
text = "Apple Inc. plans to open a new store in San Francisco by January 2024. Tim Cook, the CEO, announced the news yesterday."

# Label Tokens with the Tags in the B-I-O Scheme

In [5]:
ner_results = nlp(text)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.9996086, 'index': 1, 'word': 'Apple', 'start': 0, 'end': 5}, {'entity': 'I-ORG', 'score': 0.99942136, 'index': 2, 'word': 'Inc', 'start': 6, 'end': 9}, {'entity': 'B-LOC', 'score': 0.99934715, 'index': 11, 'word': 'San', 'start': 40, 'end': 43}, {'entity': 'I-LOC', 'score': 0.99942625, 'index': 12, 'word': 'Francisco', 'start': 44, 'end': 53}, {'entity': 'B-PER', 'score': 0.9997869, 'index': 18, 'word': 'Tim', 'start': 71, 'end': 74}, {'entity': 'I-PER', 'score': 0.99977297, 'index': 19, 'word': 'Cook', 'start': 75, 'end': 79}]


# Extract the Named Entities

In [6]:
# The code below presumes that ner_results is a list of dictionaries, each representing a token,
# arranged in the sequence they appeared in the source sentence.
organized_results = {'LOC': [], 'PER': [], 'ORG': [], 'MISC': []}

current_entity = None
current_words = []

for result in ner_results:
    entity_type = result['entity'].split('-')[1]
    if result['entity'].startswith('B-'):
        if current_entity:
            organized_results[current_entity].append(' '.join(current_words))
        current_entity = entity_type
        current_words = [result['word']]
    elif result['entity'].startswith('I-') and current_entity == entity_type:
        current_words.append(result['word'])

# Handle the last entity
if current_entity:
    organized_results[current_entity].append(' '.join(current_words))

# Remove hash symbols from words
for key, value in organized_results.items():
    organized_results[key] = [' '.join(word.split('##')) for word in value]

print(organized_results)


{'LOC': ['San Francisco'], 'PER': ['Tim Cook'], 'ORG': ['Apple Inc'], 'MISC': []}


# Generate a List of Tokens and the Corresponding List of Entity Tags

In [7]:
token_list = []
tag_list = []
for result in ner_results:
    token_list.append(result['word'])
    tag_list.append(result['entity'])

In [8]:
token_list, tag_list

(['Apple', 'Inc', 'San', 'Francisco', 'Tim', 'Cook'],
 ['B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-PER', 'I-PER'])

# Let Us Test the Model on the CoNLL2003 Data


Start by loading the CoNLL2003 dataset from the Datasets library:

In [9]:
from datasets import load_dataset

conll = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

The dataset has been split into train, test, and validation sets:

In [10]:
conll

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

Get the features in the datasets:

In [11]:
conll['test'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

Get the list of tag names:

In [12]:
tag_names = conll["test"].features[f"ner_tags"].feature.names
tag_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

The letter that prefixes each `ner_tag` indicates the token position of the entity:

- `B-` indicates the beginning of an entity.
- `I-` indicates a token is contained inside the same entity (for example, the `State` token is a part of an entity like
  `Empire State Building`).
- `0` indicates the token doesn't correspond to any entity.

## Test the Model on a Test Data

Use the instance of index 12 in the test dataset as an example:

In [13]:
example = conll['test'][12]
for key in example:
    print(key, ":", example[key])

id : 12
tokens : ['Defender', 'Hassan', 'Abbas', 'rose', 'to', 'intercept', 'a', 'long', 'ball', 'into', 'the', 'area', 'in', 'the', '84th', 'minute', 'but', 'only', 'managed', 'to', 'divert', 'it', 'into', 'the', 'top', 'corner', 'of', 'Bitar', "'s", 'goal', '.']
pos_tags : [22, 22, 22, 38, 35, 37, 12, 16, 21, 15, 12, 21, 15, 12, 16, 21, 10, 30, 38, 35, 37, 28, 15, 12, 16, 21, 15, 21, 27, 21, 7]
chunk_tags : [11, 12, 12, 21, 22, 22, 11, 12, 12, 13, 11, 12, 13, 11, 12, 12, 0, 3, 21, 22, 22, 11, 13, 11, 12, 12, 13, 11, 11, 12, 0]
ner_tags : [0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]


Convert the tag ids to tag names to see what entities are recognized:

In [14]:
example_entities = [tag_names[i] for i in example['ner_tags']]

In [15]:
for idx, w in enumerate(example['tokens']):
    print(idx, w, ":", example_entities[idx])

0 Defender : O
1 Hassan : B-PER
2 Abbas : I-PER
3 rose : O
4 to : O
5 intercept : O
6 a : O
7 long : O
8 ball : O
9 into : O
10 the : O
11 area : O
12 in : O
13 the : O
14 84th : O
15 minute : O
16 but : O
17 only : O
18 managed : O
19 to : O
20 divert : O
21 it : O
22 into : O
23 the : O
24 top : O
25 corner : O
26 of : O
27 Bitar : B-PER
28 's : O
29 goal : O
30 . : O


What is the number of original tokens in the given data?

In [16]:
len(example_entities)

31

Tokenize the input by the tokenizer. Set `is_split_into_words=True` so that the given list of tokens can be processed correctly:

In [17]:
tokenized_input = tokenizer(example['tokens'], is_split_into_words=True)
tokenized_input

{'input_ids': [101, 3177, 27896, 13583, 19166, 3152, 1106, 22205, 170, 1263, 3240, 1154, 1103, 1298, 1107, 1103, 5731, 1582, 2517, 1133, 1178, 2374, 1106, 23448, 1204, 1122, 1154, 1103, 1499, 2655, 1104, 27400, 1813, 112, 188, 2273, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

List the resultant tokens after the tokenization:

In [18]:
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
tokens

['[CLS]',
 'De',
 '##fender',
 'Hassan',
 'Abbas',
 'rose',
 'to',
 'intercept',
 'a',
 'long',
 'ball',
 'into',
 'the',
 'area',
 'in',
 'the',
 '84',
 '##th',
 'minute',
 'but',
 'only',
 'managed',
 'to',
 'diver',
 '##t',
 'it',
 'into',
 'the',
 'top',
 'corner',
 'of',
 'Bit',
 '##ar',
 "'",
 's',
 'goal',
 '.',
 '[SEP]']

What is the number of tokens generated by the tokenizer?

In [19]:
len(tokens)

38

You see there is a mismatch between the result of the tokenization and the given list of tokens. For evaluating the model's performance against the given tags, we need to realign the tokenization result with the given list of tags. Let's see whether the pipeline() could correctly handle the alignment.

Classify the tokens into recognized entities:

In [20]:
ner_results = nlp(example['tokens'])
ner_results

[[],
 [{'entity': 'B-PER',
   'score': 0.98705703,
   'index': 1,
   'word': 'Hassan',
   'start': 0,
   'end': 6}],
 [{'entity': 'B-PER',
   'score': 0.9990717,
   'index': 1,
   'word': 'Abbas',
   'start': 0,
   'end': 5}],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'entity': 'B-ORG',
   'score': 0.93285,
   'index': 1,
   'word': 'Bit',
   'start': 0,
   'end': 3},
  {'entity': 'I-ORG',
   'score': 0.5807369,
   'index': 2,
   'word': '##ar',
   'start': 3,
   'end': 5}],
 [],
 [],
 []]

What is the length of the classification result?

In [21]:
len(ner_results)

31

Wonderful! It seems that the classification by pipeline() took care of the tokenization results. Subword tokens were grouped as a single unit if the tokens came from the same word. Now, we can retrieve the list of predictions by using the prediction of the first token in each group.

In [22]:
predictions = []
for result in ner_results:
    if len(result) == 0:
        predictions.append('O')
    else:
        predictions.append(result[0]['entity'])

In [23]:
print(predictions)

['O', 'B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O']


length of the predictions?

In [24]:
len(predictions)

31

## Apply the Model to All Test Data

In [25]:
from tqdm import tqdm

In [26]:
# use the test dataset
test = conll['test']

In [27]:
from tqdm import tqdm

true_tags_list = []
predicted_tags_list = []
count = 0 # for test purpose
for atest in tqdm(test, desc=str(len(test))):
    if count < len(test) + 1:
        # add true labels to references
        true_tags_list.append([tag_names[id] for id in atest['ner_tags']])

        # recognize named entity in a test tokens
        test_ner_results = nlp(atest['tokens'])

        predicted_tags = []
        # extract the predicted tags
        for result in test_ner_results:
            if len(result) == 0:
                predicted_tags.append('O')
            else:
                predicted_tags.append(result[0]['entity'])

        predicted_tags_list.append(predicted_tags)
    count += 1

3453: 100%|██████████| 3453/3453 [48:12<00:00,  1.19it/s]


In [28]:
len(predicted_tags_list), len(true_tags_list)

(3453, 3453)

## Check the predictions match the true tags

In [29]:
flag = True
for idx, apredi in enumerate(predicted_tags_list):
    if len(apredi) != len(true_tags_list[idx]):
        flag = False
        print(idx, ":", False)
if flag:
    print(True)

True


## Evaluate

Load a evaluation method with the Huggingface [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) framework (see the Huggingface Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) about how to load and compute a metric). Seqeval actually produces several scores: precision, recall, F1, and accuracy.

In [30]:
! pip install -q evaluate seqeval

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [31]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Apply the seqeval to the predicted tags and true tags:

In [32]:
results = seqeval.compute(predictions=predicted_tags_list, references=true_tags_list)

print("precision:", results["overall_precision"]),
print("recall:", results["overall_recall"]),
print("f1:", results["overall_f1"]),
print("accuracy:", results["overall_accuracy"])

precision: 0.3569140074330386
recall: 0.49309490084985835
f1: 0.4140956062746264
accuracy: 0.9066867664477226
