<a href="https://colab.research.google.com/github/nikotang/rise-multinerd/blob/main/rise_multinerd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment: Research Engineer in Natural Language Processing
## RISE Research Institutes of Sweden

This notebook fine-tunes `bert-base-cased` twice on the English examples in MultiNERD: System A predicts every entity type in the dataset, System B only five of them.

## System A

### Installations and imports

In [None]:
!pip install -U transformers datasets evaluate seqeval accelerate

In [2]:
import gc
import json
from collections import defaultdict

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    set_seed,
)

### Pre-process the dataset

In [None]:
# Fetch the complete MultiNERD dataset (every language; it is narrowed to English in the next cell)
# and the cased BERT tokenizer. The two downloads do not depend on each other.
dataset = load_dataset('Babelscape/multinerd')
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
# keep only the rows whose `lang` column equals 'en'
def _is_english(batch):
  '''Return one boolean per row of the batch: True where the row's language is English.'''
  return [lang == 'en' for lang in batch['lang']]

eng_dataset = dataset.filter(_is_english, batched=True)

In [5]:
def tokenize_and_align_labels(examples):
  '''
  Tokenize a batch of pre-split sentences and build a `labels` column aligned with the subtokens.

  `examples` must provide `tokens` (one list of words per sentence) and `ner_tags` (one tag id per word).
  The first subtoken of every word receives the word's tag; special tokens and the remaining subtokens
  of a word receive -100 so that they are ignored in the loss calculation.
  Returns the tokenizer output with the extra `labels` entry.
  '''
  # the sentences are already split into words, so the tokenizer only has to split words into subtokens
  encodings = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

  aligned_labels = []
  for row, word_tags in enumerate(examples['ner_tags']):
    word_ids = encodings.word_ids(batch_index=row)  # word index of every subtoken, None for special tokens
    # pair every position with the word index of the position before it (None before the first one)
    preceding_ids = [None] + word_ids[:-1]
    aligned_labels.append([
      word_tags[current] if current is not None and current != preceding else -100
      for current, preceding in zip(word_ids, preceding_ids)
    ])

  encodings['labels'] = aligned_labels
  return encodings

In [None]:
# tokenize every split in batches; adds the tokenizer outputs and the aligned `labels` column
tokenized_eng = eng_dataset.map(function=tokenize_and_align_labels, batched=True)

### Create data collator and look-up dicts for model and training setup

In [7]:
# set data collator, pads to len(longest example of the batch)
# It is built from the same tokenizer that produced the inputs and is shared by every Trainer in this notebook.
# NOTE(review): presumably the `labels` column is padded with -100 by the collator's default -- confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [8]:
# set up look-up dictionaries for the model
# label-id correspondence from https://huggingface.co/datasets/Babelscape/multinerd:
# 'O' is 0, then every entity type contributes a B- tag followed by an I- tag, in the order listed here
_ENTITY_TYPES = (
  'PER', 'ORG', 'LOC', 'ANIM', 'BIO', 'CEL', 'DIS', 'EVE',
  'FOOD', 'INST', 'MEDIA', 'MYTH', 'PLANT', 'TIME', 'VEHI',
)
_BIO_LABELS = ['O'] + [f'{prefix}-{kind}' for kind in _ENTITY_TYPES for prefix in ('B', 'I')]

label2id = {name: idx for idx, name in enumerate(_BIO_LABELS)}
id2label = dict(enumerate(_BIO_LABELS))

### Set up evaluation metrics

In [None]:
# tag names in id order: label2id is written with ascending ids starting at 0,
# so the name at position i is the tag whose id is i
label_list = list(label2id)

# seqeval metric, loaded through the `evaluate` library
seqeval = evaluate.load('seqeval')

def set_compute_metrics(label_list):
  '''
  Build the `compute_metrics` callback for a given tag inventory.

  `label_list` maps a label id (used as list index) to its tag name.
  The returned function takes a `(predictions, labels)` pair, picks the highest-scoring class along axis 2,
  drops every position whose gold label is -100, converts the remaining ids to tag names and returns the
  overall precision, recall, f1 and accuracy reported by seqeval.
  '''
  def compute_metrics(p):
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
      # keep only the positions that carry a real label (-100 marks ignored positions)
      kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
      true_predictions.append([label_list[pred] for pred, _ in kept])
      true_labels.append([label_list[gold] for _, gold in kept])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    metric_names = ('precision', 'recall', 'f1', 'accuracy')
    return {name: results[f'overall_{name}'] for name in metric_names}
  return compute_metrics

### Fine-tune

In [None]:
# make space: collect unreferenced Python objects and release cached CUDA memory before loading a model
gc.collect()
torch.cuda.empty_cache()

# set arguments
training_args = TrainingArguments(
    output_dir='./a_results',
    num_train_epochs=3,
    max_steps=30000,                        # overrides training epochs
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=1000,
    learning_rate=5e-5,
    weight_decay=5e-4,
    log_level='info',
    logging_dir='./a_logs',
    logging_steps=1000,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_steps=1000,                        # same interval as eval_steps, so every evaluation has a checkpoint
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',      # determine 'best' according to eval loss
    greater_is_better=False,
    # drops the final incomplete batch of each pass over the data
    # NOTE(review): Trainer appears to apply this flag to the evaluation dataloader as well -- confirm that
    # leaving the last partial validation batch out of eval_loss is intended
    dataloader_drop_last=True,
    disable_tqdm=False
)

# Seed all RNGs before the model is created: the token-classification head is newly initialised by
# from_pretrained, so without this its starting weights (and the results) change from run to run.
set_seed(training_args.seed)

# load the model
# no explicit .to('cuda'): Trainer moves the model to training_args.device, so the cell also runs without a GPU
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_eng['train'],
    eval_dataset=tokenized_eng['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=set_compute_metrics(label_list),
    # stop once 5 evaluations in a row (one every eval_steps) bring no improvement in eval_loss
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()

# writes the model (the best checkpoint, because of load_best_model_at_end) to output_dir, i.e. ./a_results,
# which is the path the test-set evaluation cell loads from
trainer.save_model()

## System B

Train a model that predicts only five entity types plus the `O` tag (i.e. not part of an entity).

All examples are kept, but every tag whose entity type is not one of the following five is set to `O` (id 0): PERSON (PER), ORGANIZATION (ORG), LOCATION (LOC), DISEASES (DIS), ANIMAL (ANIM).

### Modify dataset labels

In [None]:
# tag inventory of system B: 'O' followed by a B-/I- pair for each of the five entity types that are kept
label_list_B = ['O'] + [
    f'{prefix}-{kind}'
    for kind in ('PER', 'ORG', 'LOC', 'ANIM', 'DIS')
    for prefix in ('B', 'I')
]

In [None]:
# System B gets its own consecutive ids (the position of each tag in label_list_B): the system A ids
# cannot be reused because training only allows label ids in range(number of classes)
id2label_B = dict(enumerate(label_list_B))
label2id_B = {label: idx for idx, label in id2label_B.items()}

In [None]:
# Translate system A label ids into system B label ids.
# Tags that exist in both systems map to their system B id; any other id falls back to 0 ('O').
# -100 (special tokens and trailing subtokens of NER entities) maps to itself so it stays ignored.
A2B_mapping = defaultdict(int, {
    **{label2id[name]: label2id_B[name] for name in label_list_B},
    -100: -100,
})

def apply_mapping(label):
  '''Return the system B id for a system A label id (0 for ids that system B does not keep).'''
  return A2B_mapping[label]

def system_B_labels(example):
  '''
  Rewrite the `ner_tags` and `labels` columns of a batch from system A ids to system B ids.

  Written for `map(..., batched=True)`: each of the two columns is a list holding one sequence of ids
  per example. Every id is passed through `apply_mapping`. Both columns are replaced on `example` itself
  and that same object is returned.

  Plain list comprehensions are used instead of `np.vectorize`, which raises a ValueError for
  zero-length sequences; an empty sequence is therefore returned as an empty list.
  '''
  for column in ('ner_tags', 'labels'):
    example[column] = [[apply_mapping(tag) for tag in tags] for tags in example[column]]
  return example

In [None]:
# derive the system B dataset from the tokenized system A dataset by remapping both label columns
tokenized_eng_B = tokenized_eng.map(function=system_B_labels, batched=True)

### Fine-tune system B

In [None]:
# make space: collect unreferenced Python objects and release cached CUDA memory before loading a model
gc.collect()
torch.cuda.empty_cache()

# same hyper-parameters as system A; only the output and logging directories differ
training_args_b = TrainingArguments(
    output_dir='./b_results',
    num_train_epochs=3,
    max_steps=30000,                        # overrides training epochs
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=1000,
    learning_rate=5e-5,
    weight_decay=5e-4,
    log_level='info',
    logging_dir='./b_logs',
    logging_steps=1000,
    evaluation_strategy='steps',
    eval_steps=1000,
    save_steps=1000,                        # same interval as eval_steps, so every evaluation has a checkpoint
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',      # determine 'best' according to eval loss
    greater_is_better=False,
    # drops the final incomplete batch of each pass over the data
    # NOTE(review): Trainer appears to apply this flag to the evaluation dataloader as well -- confirm that
    # leaving the last partial validation batch out of eval_loss is intended
    dataloader_drop_last=True,
    disable_tqdm=False
    )

# Seed all RNGs before the model is created: the token-classification head is newly initialised by
# from_pretrained, so without this its starting weights (and the results) change from run to run.
set_seed(training_args_b.seed)

# no explicit .to('cuda'): Trainer moves the model to training_args_b.device, so the cell also runs without a GPU
model_b = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=len(id2label_B),
    id2label=id2label_B,
    label2id=label2id_B,
    hidden_dropout_prob=0.5,
)

trainer = Trainer(
    model=model_b,
    args=training_args_b,
    train_dataset=tokenized_eng_B['train'],
    eval_dataset=tokenized_eng_B['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=set_compute_metrics(label_list_B),
    # stop once 5 evaluations in a row (one every eval_steps) bring no improvement in eval_loss
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

trainer.train()

# writes the model (the best checkpoint, because of load_best_model_at_end) to output_dir, i.e. ./b_results,
# which is the path the test-set evaluation cell loads from
trainer.save_model()

## Evaluate test set

In [None]:
# System A

# reload the fine-tuned model from the directory trainer.save_model() wrote to
# no explicit .to('cuda'): Trainer moves the model to test_args.device, so the cell also runs without a GPU
test_model = AutoModelForTokenClassification.from_pretrained('./a_results')

# evaluation-only arguments
test_args = TrainingArguments(
    output_dir = './a_test',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 64
)

tester = Trainer(
              model = test_model,
              args = test_args,
              data_collator=data_collator,
              compute_metrics = set_compute_metrics(label_list)
)

# metrics on the held-out test split, computed with the full system A tag inventory
a_results = tester.evaluate(eval_dataset=tokenized_eng['test'])
print(a_results)

In [None]:
# export results: write the system A test metrics as indented JSON
with open('a_results.json', 'w') as results_file:
  results_file.write(json.dumps(a_results, indent=4))

In [None]:
# System B

# reload the fine-tuned model from the directory trainer.save_model() wrote to
# no explicit .to('cuda'): Trainer moves the model to test_args.device, so the cell also runs without a GPU
test_model = AutoModelForTokenClassification.from_pretrained('./b_results')

# evaluation-only arguments
test_args = TrainingArguments(
    output_dir = './b_test',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 64
)

tester = Trainer(
              model = test_model,
              args = test_args,
              data_collator=data_collator,
              compute_metrics = set_compute_metrics(label_list_B)
)

# metrics on the held-out test split with the remapped system B labels
b_results = tester.evaluate(eval_dataset=tokenized_eng_B['test'])
print(b_results)

In [None]:
# export results: write the system B test metrics as indented JSON
with open('b_results.json', 'w') as results_file:
  results_file.write(json.dumps(b_results, indent=4))