<a href="https://colab.research.google.com/github/nguyenducminh2206/NLP-TampereUniversity/blob/main/Week_6/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Exercise 6: Named Entity Recognition (NER)
---

Part-of-speech tagging (POS): mark each word in a sentence as corresponding to a particular part of speech.

- O: the word does not correspond to any entity.
- B-PER/I-PER: corresponds to the begginning/inside a person entity.
- B-ORG/I-ORG: corresponds to the begginning/inside an organization entity.
- B-LOC/I-LOC: corresponds to the begginning/inside a location entity.
- B-MISC/I-MISC: corresponds to the begginning/inside a miscellaneous entity.

You can get more information about the dataset we used below in the link:
https://huggingface.co/datasets/eriktks/conll2003

In [None]:
!pip install transformers datasets evaluate

## Import libraries

In [21]:
import datasets
import numpy as np
import evaluate
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments

## Understanding the data

In [22]:
# Load Datasets
ner_dataset = datasets.load_dataset('conll2003', trust_remote_code=True)

In [23]:
ner_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [24]:
ner_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [25]:
# Define tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [26]:
# Tokenize a sample to see the output
text = ner_dataset['train'][0]
tokenized_text = tokenizer(text['tokens'], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'])
word_ids = tokenized_text.word_ids()

print(f"{tokenized_text} \n")
print(f"{tokens} \n")
print(word_ids)

{'input_ids': [101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} 

['[CLS]', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '[SEP]'] 

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]


## Assigning Labels to Tokens


In [27]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        # word_ids() => Return a list mapping the tokens
        # to their actual word in the initial sentence.
        # It Returns a list indicating the word corresponding to each token.
        previous_word_idx = None
        label_ids = []
        # Special tokens like `` and `<\s>` are originally mapped to None
        # We need to set the label to -100 so they are automatically ignored in the loss function.
        for word_idx in word_ids:
            if word_idx is None:
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            elif word_idx != previous_word_idx:
                # if current word_idx is != prev then its the most regular case
                # and add the corresponding token
                label_ids.append(label[word_idx])
            else:
                # to take care of sub-words which have the same word_idx
                # set -100 as well for them, but only if label_all_tokens == False
                label_ids.append(label[word_idx] if label_all_tokens else -100)
                # mask the subword representations after the first subword

            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [28]:
# Apply the function to the entire dataset
tokenized_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

In [29]:
tokenized_dataset['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],
 'input_ids': [101,
  7327,
  19164,
  2446,
  2655,
  2000,
  17757,
  2329,
  12559,
  1012,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]}

## Define the model and training arguements

In [30]:
# Labels list
label_list = ner_dataset["train"].features["ner_tags"].feature.names
label_list

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [31]:
# Model
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Training Args
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,

)

# Data Collator
data_collator = DataCollatorForTokenClassification(tokenizer)




In [34]:
# Initialize the Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1697,0.059666
2,0.0357,0.052864
3,0.015,0.056057


TrainOutput(global_step=2634, training_loss=0.059083474433684585, metrics={'train_runtime': 527.3691, 'train_samples_per_second': 79.874, 'train_steps_per_second': 4.995, 'total_flos': 1020143109346326.0, 'train_loss': 0.059083474433684585, 'epoch': 3.0})

In [83]:
model.save_pretrained('ner_model')
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [97]:
label_list = ner_dataset["train"].features["ner_tags"].feature.names

label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list)}
label_mapping

{'LABEL_0': 'O',
 'LABEL_1': 'B-PER',
 'LABEL_2': 'I-PER',
 'LABEL_3': 'B-ORG',
 'LABEL_4': 'I-ORG',
 'LABEL_5': 'B-LOC',
 'LABEL_6': 'I-LOC',
 'LABEL_7': 'B-MISC',
 'LABEL_8': 'I-MISC'}

In [102]:
from transformers import pipeline

ner_model = pipeline("ner", model="ner_model", tokenizer="tokenizer")
text = "My name is Elon Musk, I am from Mars"
results = ner_model(text)
results


Device set to use cuda:0


[{'entity': 'LABEL_0',
  'score': 0.9997913,
  'index': 1,
  'word': 'my',
  'start': 0,
  'end': 2},
 {'entity': 'LABEL_0',
  'score': 0.99964106,
  'index': 2,
  'word': 'name',
  'start': 3,
  'end': 7},
 {'entity': 'LABEL_0',
  'score': 0.9997352,
  'index': 3,
  'word': 'is',
  'start': 8,
  'end': 10},
 {'entity': 'LABEL_1',
  'score': 0.9961606,
  'index': 4,
  'word': 'el',
  'start': 11,
  'end': 13},
 {'entity': 'LABEL_1',
  'score': 0.9975944,
  'index': 5,
  'word': '##on',
  'start': 13,
  'end': 15},
 {'entity': 'LABEL_2',
  'score': 0.9981871,
  'index': 6,
  'word': 'mu',
  'start': 16,
  'end': 18},
 {'entity': 'LABEL_2',
  'score': 0.99860376,
  'index': 7,
  'word': '##sk',
  'start': 18,
  'end': 20},
 {'entity': 'LABEL_0',
  'score': 0.99977535,
  'index': 8,
  'word': ',',
  'start': 20,
  'end': 21},
 {'entity': 'LABEL_0',
  'score': 0.9998031,
  'index': 9,
  'word': 'i',
  'start': 22,
  'end': 23},
 {'entity': 'LABEL_0',
  'score': 0.99976975,
  'index': 10,
 

In [103]:
for result in results:
  if result['entity'] in label_mapping:
    result['entity'] = label_mapping[result['entity']]
results

[{'entity': 'O',
  'score': 0.9997913,
  'index': 1,
  'word': 'my',
  'start': 0,
  'end': 2},
 {'entity': 'O',
  'score': 0.99964106,
  'index': 2,
  'word': 'name',
  'start': 3,
  'end': 7},
 {'entity': 'O',
  'score': 0.9997352,
  'index': 3,
  'word': 'is',
  'start': 8,
  'end': 10},
 {'entity': 'B-PER',
  'score': 0.9961606,
  'index': 4,
  'word': 'el',
  'start': 11,
  'end': 13},
 {'entity': 'B-PER',
  'score': 0.9975944,
  'index': 5,
  'word': '##on',
  'start': 13,
  'end': 15},
 {'entity': 'I-PER',
  'score': 0.9981871,
  'index': 6,
  'word': 'mu',
  'start': 16,
  'end': 18},
 {'entity': 'I-PER',
  'score': 0.99860376,
  'index': 7,
  'word': '##sk',
  'start': 18,
  'end': 20},
 {'entity': 'O',
  'score': 0.99977535,
  'index': 8,
  'word': ',',
  'start': 20,
  'end': 21},
 {'entity': 'O',
  'score': 0.9998031,
  'index': 9,
  'word': 'i',
  'start': 22,
  'end': 23},
 {'entity': 'O',
  'score': 0.99976975,
  'index': 10,
  'word': 'am',
  'start': 24,
  'end': 26},


In [100]:
def process_entity(results):
    combined_entities = {}
    current_entity = []
    current_label = None

    for result in results:
        if '-B' in result['entity']:
            if current_entity:
                combined_entities[' '.join(current_entity)] = current_label.split('-')[1]
                current_entity = []

                current_label = result['entity']
                current_entity.append(result['word'])
        elif 'I-' in result['entity'] and current_label and result['entity'].split('-')[1] == current_label.split('-')[1]:
            current_entity.append(result['word'])

        else:
            if current_entity:
                combined_entities[' '.join(current_entity)] = current_label.split('-')[1]
                current_entity = []

            current_label = result['entity'] if 'B-' in result['entity'] else None
            if current_label:
                current_entity.append(result['word'])
    if current_entity:
        combined_entities[' '.join(current_entity)] = current_label.split('-')[1]

    return combined_entities


In [104]:
process_entity(results)

{'el': 'PER', '##on mu ##sk': 'PER', 'mars': 'LOC'}