In [1]:
# Import all necessary libraries

import transformers
import torch
import datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
from datasets import load_dataset, load_metric, DatasetDict
from transformers import TrainingArguments, Trainer



In [2]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU

has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(device)

cuda


In [3]:
# We needed to install the evaluation metric for this task

!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=18b10386717b4845a1e139f7084727c0f879766ac59422b63b9e4112ce5a1db7
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [4]:
# Fetch the MultiNERD corpus straight from the HuggingFace Hub

DATASET_ID = "Babelscape/multinerd"
multinerd = load_dataset(DATASET_ID)

Downloading and preparing dataset json/Babelscape--multinerd to /root/.cache/huggingface/datasets/json/Babelscape--multinerd-d3bf0284fd817c7b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/32.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/37.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/50.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.22M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.65M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.85M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.51M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.25M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.47M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/Babelscape--multinerd-d3bf0284fd817c7b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# The dataset ships with two splits, train and test. Each row holds the tokens, their ner_tags and a language code
multinerd

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 1339200
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 167993
    })
})

In [6]:
# Checking the dimensions of data

multinerd.shape

{'train': (1339200, 3), 'test': (167993, 3)}

In [7]:
# Keep only the English examples in every split

def _is_english(sample):
    """Return True when the example's 'lang' field is 'en'."""
    return sample['lang'] == 'en'

multinerd = multinerd.filter(_is_english)

  0%|          | 0/1340 [00:00<?, ?ba/s]

  0%|          | 0/168 [00:00<?, ?ba/s]

In [8]:
# Carve a validation split out of the training data; the test split is kept exactly as loaded

train_dataset = multinerd['train']
test_dataset = multinerd['test']

split_ratio = 0.8
num_train_samples = int(len(train_dataset) * split_ratio)

# Contiguous, unshuffled split: the first 80% of the rows train, the remaining 20% validate
train_rows = range(num_train_samples)
val_rows = range(num_train_samples, len(train_dataset))
train_set = train_dataset.select(train_rows)
val_set = train_dataset.select(val_rows)

# Rebuild the DatasetDict with the three splits
splits = {
    'train': train_set,
    'valid': val_set,
    'test': test_dataset,
}
multinerd = DatasetDict(splits)

In [9]:
multinerd

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 105024
    })
    valid: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 26256
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'lang'],
        num_rows: 16454
    })
})

In [10]:
# Load the cased base BERT tokenizer.
# Cased, because capitalization most likely plays an important role in the NER task.

TOKENIZER_CHECKPOINT = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [11]:
# BERT's WordPiece tokenizer can split a word into several sub-tokens (marked with ##),
# so the token sequence no longer lines up one-to-one with the NER tags.
# The next cells illustrate this on a single training sentence.

sample_index = 123
example = train_dataset[sample_index]
print(example)

tokenized_example = tokenizer(example['tokens'], is_split_into_words=True)

# Sub-token strings, and for each sub-token the index of the word it came from
input_ids = tokenized_example['input_ids']
tokens = tokenizer.convert_ids_to_tokens(input_ids)
word_ids = tokenized_example.word_ids()

{'tokens': ['The', 'campaign', 'was', 'organized', ',', 'among', 'others', ',', 'by', 'Abbie', 'Hoffman', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0], 'lang': 'en'}


In [12]:
# Mismatch in length

len(example['ner_tags']), len(tokenized_example['input_ids'])

(12, 16)

In [13]:
# This is due to the presence of special tokens and subwords (typical of BERT), as figured in this example

tokens

['[CLS]',
 'The',
 'campaign',
 'was',
 'organized',
 ',',
 'among',
 'others',
 ',',
 'by',
 'A',
 '##bb',
 '##ie',
 'Hoffman',
 '.',
 '[SEP]']

In [14]:
def tokenize_align(example, label_all_tokens=True):
    '''
    Tokenize a batch of pre-split sentences and align the NER tags with the resulting sub-tokens.

    Args:
        example (dict): A batch containing 'tokens' (one list of words per sentence) and
            'ner_tags' (one list of tag ids per sentence, parallel to 'tokens').
            The function indexes the tokenizer output by batch position, so it must be
            called on batched input.
        label_all_tokens (bool, optional): If True, every sub-token of a word receives the
            word's tag id; if False, only the first sub-token does and the rest get -100.

    Returns:
        The tokenizer output with an added 'labels' entry holding one aligned label list per sentence.

    Note:
        Sub-tokens without a word index (special tokens) are labelled -100, which PyTorch
        ignores during training.
        With label_all_tokens=True, continuation sub-tokens reuse exactly the same tag id as the
        first sub-token of their word.

    '''

    tokenized_input = tokenizer(example['tokens'], truncation=True, is_split_into_words=True)

    def _align(sentence_tags, word_ids):
        # Build the label list for one sentence from its per-sub-token word indices
        aligned = []
        prev_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: ignored during training
                aligned.append(-100)
            elif word_idx == prev_word_idx and not label_all_tokens:
                # Continuation sub-token of the previous word, and only first pieces are labelled
                aligned.append(-100)
            else:
                # First sub-token of a word, or a continuation when all sub-tokens are labelled
                aligned.append(sentence_tags[word_idx])
            prev_word_idx = word_idx
        return aligned

    # Attach the aligned labels to the tokenized batch
    tokenized_input['labels'] = [
        _align(sentence_tags, tokenized_input.word_ids(batch_index=row))
        for row, sentence_tags in enumerate(example['ner_tags'])
    ]

    return tokenized_input

# System A

In [15]:
# Tokenize every split (train / valid / test) and attach the aligned labels.
# batched=True is required: tokenize_align indexes word_ids by batch position.

tokenized_data = multinerd.map(tokenize_align, batched=True)

  0%|          | 0/106 [00:00<?, ?ba/s]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

In [16]:
# Load pretrained cased BERT from HuggingFace with a newly initialized token-classification head.
# num_labels=31 matches the 31 tag ids (0-30) of the full tag set listed in label_mapping.

model_a = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=31)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Collator that pads the tokenized samples to a common length when they are batched

data_collator = DataCollatorForTokenClassification(tokenizer)

In [18]:
# Training configuration for System A

hparams_a = dict(
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # logging_steps=1000,
    save_strategy="no",
)

# Experiment tracking: metrics are reported to Weights & Biases under this run name
tracking_a = dict(report_to="wandb", run_name="RISE_A")

training_args_a = TrainingArguments(
    output_dir="./fine_tune_bert_output_a",
    **hparams_a,
    **tracking_a,
)

In [19]:
# Tag names in tag-id order (index == tag id), used to make predictions readable

label_names_mapped = [
    "O",
    "B-PER", "I-PER",
    "B-ORG", "I-ORG",
    "B-LOC", "I-LOC",
    "B-ANIM", "I-ANIM",
    "B-BIO", "I-BIO",
    "B-CEL", "I-CEL",
    "B-DIS", "I-DIS",
    "B-EVE", "I-EVE",
    "B-FOOD", "I-FOOD",
    "B-INST", "I-INST",
    "B-MEDIA", "I-MEDIA",
    "B-MYTH", "I-MYTH",
    "B-PLANT", "I-PLANT",
    "B-TIME", "I-TIME",
    "B-VEHI", "I-VEHI",
]

# Same information as an id -> name dictionary
label_mapping = dict(enumerate(label_names_mapped))

len(label_names_mapped)

31

In [20]:
# seqeval, loaded through HuggingFace datasets, is the evaluation metric; it is adequate for chunking tasks such as NER
# NOTE(review): load_metric has been deprecated in later `datasets` releases in favour of the
# separate `evaluate` package - confirm against the installed version before upgrading

metric = datasets.load_metric('seqeval')

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [21]:
# Take one training sentence and spell out its tag ids as tag names

example = train_dataset[144]
print(example)

labels = []
for tag_id in example['ner_tags']:
    labels.append(label_names_mapped[tag_id])
labels

{'tokens': ['However', ',', 'the', 'plans', 'were', 'revived', 'in', 'August', '2017', ',', 'with', 'the', 'announcement', 'that', 'Paul', 'Scheer', 'would', 'be', 'writing', 'the', 'series', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0], 'lang': 'en'}


['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [22]:
# Sanity-check the metric on one sentence: scoring the gold labels against themselves should give perfect scores

metric.compute(predictions=[labels], references=[labels])

{'PER': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [23]:
def compute_metrics_a(eval_preds):

    '''
    Evaluation metrics for System A, computed with seqeval on the full tag set.

    Parameters:
        eval_preds: a tuple with predicted logits and true labels

    Returns:
        dictionary with the overall precision, recall, F1 and accuracy
    '''

    logits, gold_ids = eval_preds
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions whose gold label is -100 are left out of the evaluation
        kept = [(p, g) for (p, g) in zip(pred_row, gold_row) if g != -100]
        predictions.append([label_mapping[p] for (p, _) in kept])
        true_labels.append([label_mapping[g] for (_, g) in kept])

    results = metric.compute(predictions=predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [24]:
# Train System A
# NOTE: we train for only 3 epochs because of limited GPU access;
# training for more epochs is advisable for a NER task

trainer_a = Trainer(
    model=model_a,
    args=training_args_a,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],  # validation split carved out of the original training data
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_a
)

trainer_a.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.2829,0.12135,0.84752,0.725556,0.78181,0.961095
1000,0.0993,0.104465,0.806625,0.81159,0.8091,0.96446
1500,0.0847,0.095065,0.831983,0.820897,0.826403,0.966689
2000,0.0828,0.092473,0.887791,0.782459,0.831803,0.967553
2500,0.075,0.089553,0.864478,0.82124,0.842304,0.969598
3000,0.0721,0.083958,0.858553,0.829143,0.843592,0.970074
3500,0.066,0.08364,0.845466,0.849848,0.847651,0.970321
4000,0.0686,0.080008,0.875519,0.834267,0.854396,0.971793
4500,0.0647,0.076475,0.870606,0.856885,0.863691,0.972998
5000,0.0623,0.076157,0.868724,0.858393,0.863528,0.972823


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=19692, training_loss=0.0518820724084371, metrics={'train_runtime': 6153.7438, 'train_samples_per_second': 51.2, 'train_steps_per_second': 3.2, 'total_flos': 9103295020508832.0, 'train_loss': 0.0518820724084371, 'epoch': 3.0})

In [25]:
# Evaluate System A on the test split; `results` carries the predictions and label ids used for the report below

results = trainer_a.predict(tokenized_data["test"])

In [26]:
# Token-level classification report for System A on the test split

pred_logits, labels = results.predictions, results.label_ids
pred_logits = np.argmax(pred_logits, axis=2)
labels = np.asarray(labels)

# Positions labelled -100 are excluded from the evaluation.
# Boolean masking flattens row by row, i.e. in the same order as flattening first and filtering afterwards.
keep_mask = labels != -100
y_true = labels[keep_mask]
y_pred = pred_logits[keep_mask]

# `labels` pins which tag id each row of the report refers to, so target_names always line up
# with the ids, even if some tag never occurs in the gold labels or in the predictions.
# zero_division=0 keeps the 0.00 scores for such tags without emitting a warning for each of them.
report = classification_report(
    y_true,
    y_pred,
    labels=list(label_mapping.keys()),
    target_names=list(label_mapping.values()),
    zero_division=0,
)

print(report)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           O       0.99      0.99      0.99    329987
       B-PER       0.99      0.99      0.99      7354
       I-PER       1.00      1.00      1.00     11509
       B-ORG       0.98      0.98      0.98      5131
       I-ORG       0.99      0.98      0.99      5693
       B-LOC       1.00      1.00      1.00     19489
       I-LOC       0.99      1.00      0.99      7129
      B-ANIM       0.74      0.79      0.77      3351
      I-ANIM       0.71      0.78      0.75      1658
       B-BIO       0.49      0.88      0.63        34
       I-BIO       0.00      0.00      0.00         0
       B-CEL       0.83      0.88      0.85        56
       I-CEL       0.85      0.77      0.81        22
       B-DIS       0.79      0.82      0.81      2014
       I-DIS       0.78      0.80      0.79       917
       B-EVE       0.95      0.96      0.95       451
       I-EVE       0.95      0.97      0.96       667
      B-FOOD       0.72    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# System B

In [27]:
# Restrict the tag set to the entity types needed for System B;
# every other tag id is collapsed into 0

valid_ner_tags = [1, 2, 3, 4, 5, 6, 7, 8, 13, 14]

def modify_ner_tags(sample):
    """Replace every tag id of `sample` that is not in valid_ner_tags with 0.

    The 'ner_tags' entry of `sample` is overwritten and the same object is returned.
    """
    kept_tags = []
    for tag in sample['ner_tags']:
        if tag in valid_ner_tags:
            kept_tags.append(tag)
        else:
            kept_tags.append(0)
    sample['ner_tags'] = kept_tags
    return sample

# Apply the modification to every split of the dataset
# (map is not batched here, so modify_ner_tags receives one example at a time)
modified_dataset = multinerd.map(modify_ner_tags)

  0%|          | 0/105024 [00:00<?, ?ex/s]

  0%|          | 0/26256 [00:00<?, ?ex/s]

  0%|          | 0/16454 [00:00<?, ?ex/s]

In [28]:
# Tokenize the tag-reduced dataset and align its labels, with the same function used for System A

tokenized_data_b = modified_dataset.map(tokenize_align, batched=True)

  0%|          | 0/106 [00:00<?, ?ba/s]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/17 [00:00<?, ?ba/s]

In [29]:
# Id -> name mapping restricted to the tags kept for System B (the ids keep their original values)

kept_label_pairs = [
    (0, "O"),
    (1, "B-PER"), (2, "I-PER"),
    (3, "B-ORG"), (4, "I-ORG"),
    (5, "B-LOC"), (6, "I-LOC"),
    (7, "B-ANIM"), (8, "I-ANIM"),
    (13, "B-DIS"), (14, "I-DIS"),
]

new_label_mapping = dict(kept_label_pairs)

In [30]:
# Metric function for System B, using the reduced label dictionary

def compute_metrics_b(eval_preds):
    '''
    Evaluation metrics for System B (reduced tag set), computed with seqeval.

    Parameters:
        eval_preds: a tuple with predicted logits and true labels

    Returns:
        dictionary with the overall precision, recall, F1 and accuracy

    Note:
        model_b has 15 output classes, but new_label_mapping only names 11 of them
        (ids 9-12 are unused). A predicted id without a name is reported as "O",
        mirroring how modify_ner_tags folds those tags into 0, instead of raising a KeyError
        in the middle of training.
    '''
    pred_logits, labels = eval_preds
    pred_ids = np.argmax(pred_logits, axis=2)

    # Positions whose true label is -100 are left out of the evaluation
    predictions = [
        [new_label_mapping.get(int(prediction), "O") for (prediction, l) in zip(row_preds, row_labels) if l != -100]
        for row_preds, row_labels in zip(pred_ids, labels)
    ]

    # True labels only ever contain the kept ids, so a plain lookup is safe here
    true_labels = [
        [new_label_mapping[l] for l in row_labels if l != -100]
        for row_labels in labels
    ]

    results = metric.compute(predictions=predictions, references=true_labels)

    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [31]:
# Training configuration for System B (same hyperparameters as System A, separate output directory and run name)

hparams_b = dict(
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # logging_steps=1000,
    save_strategy="no",
)

# Experiment tracking: metrics are reported to Weights & Biases under this run name
tracking_b = dict(report_to="wandb", run_name="RISE_B")

training_args_b = TrainingArguments(
    output_dir="./fine_tune_bert_output_b",
    **hparams_b,
    **tracking_b,
)

In [32]:
# The kept tag ids are not contiguous (the highest one is 14), so the classification head needs 15 outputs;
# ids 9-12 never occur as labels in the modified dataset
model_b = AutoModelForTokenClassification.from_pretrained('bert-base-cased', num_labels=15)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Train System B: same collator and tokenizer as System A, but on the tag-reduced data
trainer_b = Trainer(
    model=model_b,
    args=training_args_b,
    train_dataset=tokenized_data_b["train"],
    eval_dataset=tokenized_data_b["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_b
)

trainer_b.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.1765,0.064446,0.823101,0.888568,0.854583,0.975103
1000,0.0563,0.05177,0.889522,0.891664,0.890592,0.981935
1500,0.0492,0.049213,0.896246,0.88746,0.891831,0.982248
2000,0.0475,0.045664,0.898012,0.907124,0.902545,0.98365
2500,0.0438,0.046967,0.887571,0.915303,0.901224,0.983222
3000,0.0418,0.043719,0.904984,0.899346,0.902157,0.983698
3500,0.0391,0.041508,0.897305,0.92054,0.908774,0.984716
4000,0.0382,0.041052,0.922074,0.895907,0.908802,0.985027
4500,0.0368,0.039253,0.924844,0.906092,0.915372,0.985655
5000,0.0347,0.039243,0.907314,0.927878,0.917481,0.985655


TrainOutput(global_step=19692, training_loss=0.028760366073692086, metrics={'train_runtime': 5969.4722, 'train_samples_per_second': 52.781, 'train_steps_per_second': 3.299, 'total_flos': 9101978528338080.0, 'train_loss': 0.028760366073692086, 'epoch': 3.0})

In [34]:
# Evaluate System B on the tag-reduced test split
results_b = trainer_b.predict(tokenized_data_b["test"])

In [35]:
# Token-level classification report for System B on the test split

pred_logits, labels = results_b.predictions, results_b.label_ids
pred_logits = np.argmax(pred_logits, axis=2)
labels = np.asarray(labels)

# Positions labelled -100 are excluded from the evaluation.
# Boolean masking flattens row by row, i.e. in the same order as flattening first and filtering afterwards.
keep_mask = labels != -100
y_true_b = labels[keep_mask]
y_pred_b = pred_logits[keep_mask]

# model_b has 15 output classes but only 11 of them carry a name. Passing `labels` ties every
# name to its tag id explicitly; without it, the names are matched by position to whichever ids
# happen to occur, and a single prediction of an unused id (9-12) makes the call fail because
# the number of classes no longer equals the number of target_names.
report_b = classification_report(
    y_true_b,
    y_pred_b,
    labels=list(new_label_mapping.keys()),
    target_names=list(new_label_mapping.values()),
    zero_division=0,
)

print(report_b)

              precision    recall  f1-score   support

           O       1.00      0.99      0.99    337859
       B-PER       0.99      0.99      0.99      7354
       I-PER       1.00      1.00      1.00     11509
       B-ORG       0.98      0.98      0.98      5131
       I-ORG       0.99      0.99      0.99      5693
       B-LOC       1.00      0.99      1.00     19489
       I-LOC       0.99      1.00      0.99      7129
      B-ANIM       0.75      0.78      0.77      3351
      I-ANIM       0.71      0.76      0.74      1658
       B-DIS       0.80      0.83      0.82      2014
       I-DIS       0.78      0.79      0.78       917

    accuracy                           0.99    402104
   macro avg       0.91      0.92      0.91    402104
weighted avg       0.99      0.99      0.99    402104

