# NLP Exercise 6: Named Entity Recognition (NER)
---

Named entity recognition (NER): label each word in a sentence with the kind of entity it belongs to (person, organization, location, ...), or mark it as not belonging to any entity. The labels are:

- O: the word does not correspond to any entity.
- B-PER/I-PER: the word is at the beginning of / inside a person entity.
- B-ORG/I-ORG: the word is at the beginning of / inside an organization entity.
- B-LOC/I-LOC: the word is at the beginning of / inside a location entity.
- B-MISC/I-MISC: the word is at the beginning of / inside a miscellaneous entity.

More information about the dataset used below is available at:
https://huggingface.co/datasets/eriktks/conll2003

## Preprocessing

### Import libraries

In [None]:
import datasets
import numpy as np
import evaluate
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments

### Understanding the data

In [None]:
# Load the CoNLL-2003 NER dataset.
# trust_remote_code=True allows the dataset's own loading script to run.
ner_dataset = datasets.load_dataset('conll2003', trust_remote_code=True)

In [None]:
# Show the loaded dataset object
ner_dataset

In [None]:
# Look at the first training example
ner_dataset['train'][0]

In [None]:
# Define the tokenizer: the fast BERT tokenizer for 'bert-base-uncased'.
# The fast variant is required because word_ids() (used below) is only
# available on fast tokenizers.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize one training sentence (already split into words) to see the output
text = ner_dataset['train'][0]
tokenized_text = tokenizer(text['tokens'], is_split_into_words=True)
# Turn the input ids back into token strings to see how the words were split
tokens = tokenizer.convert_ids_to_tokens(tokenized_text['input_ids'])
# For every token, the index of the word it came from (None for special tokens)
word_ids = tokenized_text.word_ids()

print(f"{tokenized_text} \n")
print(f"{tokens} \n")
print(word_ids)

### Assigning Labels to Tokens


In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align the NER tags to tokens.

    The tokenizer may split one word into several sub-word tokens, so the
    per-word tags in ``examples["ner_tags"]`` have to be expanded to one
    label per token.

    Args:
        examples: A batch with a ``"tokens"`` entry (one list of words per
            sentence) and a ``"ner_tags"`` entry (one list of tag ids per
            sentence, parallel to the words).
        label_all_tokens: If true, every sub-word token of a word receives
            the word's tag. Otherwise only the first sub-word token is
            labelled and the remaining ones get -100.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one list of label ids per sentence.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for sentence_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        # word_ids() gives, for each token, the index of the word it came
        # from; special tokens added by the tokenizer map to None.
        for current_word in encoded.word_ids(batch_index=sentence_idx):
            if current_word is None:
                # Special token: -100 so it is ignored by the loss function.
                token_label = -100
            elif current_word == last_word and not label_all_tokens:
                # Later sub-word of an already labelled word: mask it out.
                token_label = -100
            else:
                # First token of a word, or a later sub-word while
                # label_all_tokens is enabled: use the word's own tag.
                token_label = word_tags[current_word]
            token_labels.append(token_label)
            last_word = current_word
        aligned_labels.append(token_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [None]:
# Apply the function to the entire dataset.
# batched=True hands the function a batch of examples per call, which is why
# it loops over examples["ner_tags"].
tokenized_dataset = ner_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Inspect the first processed training example (it now also has "labels")
tokenized_dataset['train'][0]

## Training Loop

### Define the model and training arguments

In [None]:
# Labels list: the tag names declared by the dataset's "ner_tags" feature
label_list = ner_dataset["train"].features["ner_tags"].feature.names
label_list

In [None]:
# Model: pretrained 'bert-base-uncased' set up for token classification,
# with one output label per entry in label_list
model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(label_list))

In [None]:
# Training Args: write outputs to ./results, evaluate once per epoch,
# train for 3 epochs with batches of 16 and a weight decay of 0.01.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,

)

# Data Collator: assembles tokenized examples into batches for
# token classification
data_collator = DataCollatorForTokenClassification(tokenizer)


In [None]:
# Initialize the Trainer with the model, the training arguments, the
# train/validation splits of the tokenized dataset and the data collator
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator
)
# Run the fine-tuning
trainer.train()

## Save and test model

In [None]:
# Save the fine-tuned model and the tokenizer to local directories;
# the pipeline cell below loads them back by these names.
model.save_pretrained('ner_model')
tokenizer.save_pretrained('tokenizer')

### Postprocessing

In [None]:
# Tag names declared by the dataset's "ner_tags" feature
label_list = ner_dataset["train"].features["ner_tags"].feature.names

# Map the generic "LABEL_<i>" names to the dataset's tag names
# (used below to relabel the pipeline output)
label_mapping = {f"LABEL_{i}": label for i, label in enumerate(label_list)}
label_mapping

In [None]:
from transformers import pipeline

# Build an NER pipeline from the model and tokenizer directories saved above
ner_model = pipeline("ner", model="ner_model", tokenizer="tokenizer")
text = "My name is Elon Musk, I am from Mars"
# Predictions for the sentence: dicts with (at least) the 'entity' and 'word'
# keys that the following cells read
results = ner_model(text)
results

In [None]:
# Make the output easier to read: swap every generic "LABEL_<i>" name for
# its tag name, leaving labels without a mapping untouched
for result in results:
    result['entity'] = label_mapping.get(result['entity'], result['entity'])
results

In [None]:
def process_entity(results):
    """Merge token-level NER predictions into whole entities.

    Args:
        results: Iterable of prediction dicts. Each one needs an ``'entity'``
            key holding an IOB tag (``'O'``, ``'B-PER'``, ``'I-PER'``, ...)
            and a ``'word'`` key holding the token text.

    Returns:
        A dict mapping each entity's text to its type, e.g.
        ``{'elon musk': 'PER', 'mars': 'LOC'}``. If the same text occurs more
        than once, the last occurrence wins.

    Notes:
        * A ``B-`` tag opens a new entity; an ``I-`` tag of the same type
          extends the open one. Anything else closes the open entity, and
          an ``I-`` tag with no matching open entity is dropped.
        * WordPiece continuation tokens (``'##...'``) are glued onto the
          previous token instead of being joined with a space. A
          continuation token can never start a new word, so it also extends
          the open entity when it carries a ``B-`` tag of the same type.
    """
    combined_entities = {}
    current_words = []
    current_type = None

    for result in results:
        word = result['word']
        # 'B-PER' -> ('B', 'PER'); 'O' -> ('O', '')
        prefix, _, entity_type = result['entity'].partition('-')
        is_subword = word.startswith('##')

        same_entity = bool(current_words) and entity_type == current_type
        if same_entity and (prefix == 'I' or (prefix == 'B' and is_subword)):
            if is_subword:
                # Re-attach the word piece to the token it was split from.
                current_words[-1] += word[2:]
            else:
                current_words.append(word)
            continue

        # Every other tag ends the entity that is currently open.
        if current_words:
            combined_entities[' '.join(current_words)] = current_type
            current_words = []
            current_type = None

        if prefix == 'B' and entity_type:
            current_type = entity_type
            current_words.append(word)

    # Flush an entity that runs up to the end of the input.
    if current_words:
        combined_entities[' '.join(current_words)] = current_type

    return combined_entities


In [None]:
# Merge the token-level predictions into whole entities
process_entity(results)