# NLP Exercise 6: Named Entity Recognition (NER)
---

Named Entity Recognition (NER): mark each word in a sentence with the type of named entity it belongs to, if any. The dataset uses the following tags:

- O: the word does not belong to any entity.
- B-PER/I-PER: the word is at the beginning of / inside a person entity.
- B-ORG/I-ORG: the word is at the beginning of / inside an organization entity.
- B-LOC/I-LOC: the word is at the beginning of / inside a location entity.
- B-MISC/I-MISC: the word is at the beginning of / inside a miscellaneous entity.

More information about the dataset used below is available at:
https://huggingface.co/datasets/eriktks/conll2003

## Import libraries

In [31]:
import datasets
import numpy as np
import evaluate
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments

## Understanding the data

In [2]:
# Load the CoNLL-2003 dataset (train / validation / test splits).
# trust_remote_code=True permits the dataset's own loading script to run locally.
ner_dataset = datasets.load_dataset('conll2003', trust_remote_code=True)

In [3]:
# Show the available splits with their columns and row counts
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
# Inspect the first training example: word-level tokens plus integer-encoded tags
ner_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
# Define the tokenizer. The "fast" tokenizer class is used because the cells
# below call word_ids() to map sub-word tokens back to their source words.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [6]:
# Run the tokenizer on one training example to see what it produces.
# The example is already split into words, hence is_split_into_words=True.
text = ner_dataset['train'][0]
tokenized_text = tokenizer(text['tokens'], is_split_into_words=True)

# Sub-word strings for every input id, and for each of them the index of the
# word it came from (None for the special tokens added by the tokenizer).
word_ids = tokenized_text.word_ids()
tokens = tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'])

print(f"{tokenized_text} \n", f"{tokens} \n", word_ids, sep="\n")

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]'] 

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


## Assigning Labels to Tokens


In [19]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align the NER tags to the tokens.

    The tokenizer may add special tokens and split a word into several
    sub-word tokens, so the word-level tags in ``examples["ner_tags"]`` have to
    be expanded to one label per produced token.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence)
            and an "ner_tags" entry (one list of tag ids per sentence).
        label_all_tokens: Controls the label given to the second and later
            sub-word tokens of a word. If True they repeat the word's tag id
            unchanged (a B- tag is repeated as-is); if False they get -100.

    Returns:
        The tokenizer output for the batch with an added "labels" entry
        holding one list of label ids per sentence.
    """
    # Label for positions that should be ignored by the loss function.
    ignore_index = -100

    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        # For every token of this sentence: index of its source word,
        # or None for special tokens inserted by the tokenizer.
        token_to_word = encoded.word_ids(batch_index=row)

        row_labels = []
        before = None  # word index of the previously visited token
        for current in token_to_word:
            if current is None:
                # Special token: no word, hence no tag.
                tag = ignore_index
            elif current != before or label_all_tokens:
                # First token of a word always carries the word's tag;
                # later sub-word tokens carry it only when requested.
                tag = word_tags[current]
            else:
                # Continuation sub-word with label_all_tokens disabled.
                tag = ignore_index
            row_labels.append(tag)
            before = current

        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [20]:
# Apply the function to every split of the dataset.
# batched=True hands the function whole batches (lists of sentences), which is
# what tokenize_and_align_labels expects; label_all_tokens keeps its default.
tokenized_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map: 100%|██████████| 14041/14041 [00:02<00:00, 6381.51 examples/s]
Map: 100%|██████████| 3250/3250 [00:00<00:00, 9060.94 examples/s] 
Map: 100%|██████████| 3453/3453 [00:00<00:00, 7031.24 examples/s]


In [21]:
# Check the first example again: it now also carries input_ids,
# token_type_ids, attention_mask and the aligned labels
tokenized_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

## Define the model and training arguments

In [40]:
# Tag names of the dataset; the position in the list is the integer id
# stored in the 'ner_tags' column
label_list = ner_dataset["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [41]:
# Model: pretrained BERT encoder with a new token-classification head that has
# one output per NER tag.
# id2label / label2id store the tag names in the model config, so predictions
# and saved checkpoints report 'B-PER', 'I-LOC', ... instead of the generic
# 'LABEL_0', 'LABEL_1', ... names.
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={name: idx for idx, name in enumerate(label_list)},
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Training configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # run evaluation once per epoch
)

# Collator that assembles tokenized examples (inputs and labels) into batches
data_collator = DataCollatorForTokenClassification(tokenizer)


In [55]:
# Wire the model, configuration, data and collator together, then fine-tune.
# The validation split is used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 