<a href="https://colab.research.google.com/github/marendtz/ModelNER/blob/master/XLMR_finetune_pan_and_custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preparations**

In [44]:
# for debugging: clean up model
import shutil
try:
  shutil.rmtree('/content/maren-hugg/xlm-roberta-base-finetuned-panx-en-news')
except:
  print("no such folder")

In [3]:
# get github access token for collab
import getpass
token = getpass.getpass()

··········


In [4]:
# clean workdir
%cd /content/
%rm -r /content/*
%ls -la
%pwd

/content
total 16
drwxr-xr-x 1 root root 4096 Jun 12 06:48 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Jun 12 06:05 [01;34m..[0m/
drwxr-xr-x 4 root root 4096 Jun  8 18:17 [01;34m.config[0m/


'/content'

In [5]:
# clone github repo
!git clone https://{token}@github.com/marendtz/ModelNER.git

Cloning into 'ModelNER'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (57/57), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 57 (delta 29), reused 23 (delta 7), pack-reused 0[K
Unpacking objects: 100% (57/57), 841.16 KiB | 2.34 MiB/s, done.


In [6]:
# install dependencies
%pip install -r /content/ModelNER/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
# import packages
from collections import defaultdict
import pandas as pd
import numpy as np
from datasets import load_dataset_builder, get_dataset_config_names, load_dataset, DatasetDict

from transformers import AutoTokenizer, TrainingArguments, DataCollatorForTokenClassification, Trainer
import nltk.data
nltk.download('punkt')

import torch.nn as nn
from transformers import XLMRobertaConfig, AutoConfig, XLMRobertaForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput

from huggingface_hub import notebook_login
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from seqeval.scheme import IOB2


from torch.nn.functional import cross_entropy
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Get and Analyze dataset from Huggingface**

In [8]:
# login to HugginFace to enable upload of model
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [57]:
# define model

# get info from dataset later used for training our model
# load dataset to get the tags used, so we later take the same...

ds = load_dataset("maren-hugg/sustainability_ner")
tags = ds["train"].features["ner_tags"].feature
print(tags)

# generate function to convert name and id of tags
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
print(index2tag)
print(tag2index)



  0%|          | 0/3 [00:00<?, ?it/s]

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


**Prepare input data for following training of the model XLMRobertaForTokenClassification (with our config=labels, device, ...)**

In [46]:
# prepare dataset
ds_custom = ds
print("Example element in Custom Dataset:")
ds_custom_element = ds["train"][1]
for key, value in ds_custom_element.items():
    print(f"{key}: {value}")

Example element in Custom Dataset:
ner_tags: [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0]
tokens: ['Porsche', 'AG', 'is', 'also', 'a', 'member', 'of', 'the', 'employers', "'", 'association', 'Suedwestmetall', 'and', 'is', 'therefore', 'part', 'of', 'the', 'social', 'partnership', 'actively', 'practiced', 'between', 'the', 'metal', 'and', 'electrical', 'industry', 'and', 'the', 'IG', 'Metall', 'trade', 'union', '.']


In [58]:
# show info about tag feature --> we already defined the varibale tags earlier
# --> DatasetDict holds information about all features used (like infos on columns in dataframe)
print('----------features----------------')
print(ds_custom["train"].features)
print('----------feature ner_tags----------------')
print(ds_custom["train"].features["ner_tags"].feature)


----------features----------------
{'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
----------feature ner_tags----------------
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [48]:
# examples
print(ds_custom["train"][1])
print(ds_custom["train"][1]["tokens"])
print(ds_custom["train"][1]["ner_tags"])

{'ner_tags': [3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0], 'tokens': ['Porsche', 'AG', 'is', 'also', 'a', 'member', 'of', 'the', 'employers', "'", 'association', 'Suedwestmetall', 'and', 'is', 'therefore', 'part', 'of', 'the', 'social', 'partnership', 'actively', 'practiced', 'between', 'the', 'metal', 'and', 'electrical', 'industry', 'and', 'the', 'IG', 'Metall', 'trade', 'union', '.']}
['Porsche', 'AG', 'is', 'also', 'a', 'member', 'of', 'the', 'employers', "'", 'association', 'Suedwestmetall', 'and', 'is', 'therefore', 'part', 'of', 'the', 'social', 'partnership', 'actively', 'practiced', 'between', 'the', 'metal', 'and', 'electrical', 'industry', 'and', 'the', 'IG', 'Metall', 'trade', 'union', '.']
[3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0]


In [49]:
count_per = 0
count_org = 0
count_loc = 0
for sentence in ds_custom["train"]:
  tags = sentence["ner_tags"]
  count_per = count_per + tags.count(1)
  count_org = count_org + tags.count(3)
  count_loc = count_loc + tags.count(5)

for sentence in ds_custom["validation"]:
  tags = sentence["ner_tags"]
  count_per = count_per + tags.count(1)
  count_org = count_org + tags.count(3)
  count_loc = count_loc + tags.count(5)

for sentence in ds_custom["test"]:
  tags = sentence["ner_tags"]
  count_per = count_per + tags.count(1)
  count_org = count_org + tags.count(3)
  count_loc = count_loc + tags.count(5)

print(count_per)
print(count_org)
print(count_loc)
print(count_per + count_org + count_loc)

91
1716
629
2436


**Set tokenizer**

In [50]:
# setup for tokenization
xlmr_model_name = "maren-hugg/xlm-roberta-base-finetuned-panx-en"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

**Define helper function for data preparation**

In [51]:
# the library datasets provides a fast way to tokenize dataset objects with the map() operation.
# the returned input ids need to be augmented with the attention mask and the label ids, that encode the information about with token is associated with each NER tag

def tokenize_and_align_labels(examples):
  tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)  # sequence is already split in words
  labels = []
  for idx, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=idx) # get word ids to associate subwords
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100) # -100 is chose, since the PyTorch cross-entropy loss class torch.nn.CrossEntropyLoss has an attribute ignore_index, whose value is -100, which is therefor ignored during training
      else:
        label_ids.append(label[word_idx])
      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs

def encode_ds_custom_dataset(corpus):
  return corpus.map(tokenize_and_align_labels, batched=True,remove_columns=['ner_tags', 'tokens'])



In [56]:
# encode dataset
ds_custom_encoded = encode_ds_custom_dataset(ds_custom)
print("Encoded example:")
print(ds_custom_encoded["train"][1])



Encoded example:
{'input_ids': [0, 81201, 24639, 83, 2843, 10, 32786, 111, 70, 143889, 7, 242, 125413, 1832, 297, 25617, 185790, 136, 83, 127298, 2831, 111, 70, 2265, 165410, 36457, 538, 41361, 71, 17721, 70, 12924, 136, 39108, 289, 53099, 136, 70, 79883, 94492, 52350, 69941, 6, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 3, 4, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, 3, -100, -100, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, -100, 0, 0, 0, 0, 0, -100, 0, 0, 0, 3, 4, 0, 0, 0, -100, -100]}


**Define performance measures**

In [53]:
# common approach: results for precision, recall, F1-score
# in NER: all words of an entity need to be predicted correctly in order for a prediction to be counted as correct
# libary seqeval: expects predictions and labels as lists of lists 


# during training we need to convert the outputs of the model into such a list that sequeval expects
def align_predictions(predictions, label_ids):
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  labels_list, preds_list = [], []

  for batch_idx in range(batch_size):
    example_labels, example_preds = [], []
    for seq_idx in range(seq_len):
      # Ignore label IDs = -100
      if label_ids[batch_idx, seq_idx] != -100:
        example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        example_preds.append(index2tag[preds[batch_idx][seq_idx]])
    labels_list.append(example_labels)
    preds_list.append(example_preds)
  return preds_list, labels_list

# helper function for calculation of F1-score for validation set
def compute_metrics(eval_pred):
  y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  return {"f1": f1_score(y_true, y_pred),
          "precision": precision_score(y_true, y_pred),
          "recall": recall_score(y_true, y_pred),
          "accuracy": f1_score(y_true, y_pred),
          "classification_report": classification_report(y_true, y_pred, mode='strict', scheme=IOB2)
          }


**Finetuning XLM-RoBERTa and Upload to Huggingface**

In [59]:
# hide_output

import torch

# passing keyword arguments to the from_pretrained() method overrides default values
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)

# define "device"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [85]:
# docu: https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/trainer#transformers.TrainingArguments
num_epochs = 3
batch_size = 16
logging_steps = len(ds_custom_encoded["train"]) // batch_size 
model_name = f"{xlmr_model_name}-custom"
training_args = TrainingArguments(output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="steps", eval_steps=batch_size, # Number of update steps between two evaluations if evaluation_strategy="steps".
                                  save_strategy="steps", save_steps=1e6, 
                                  weight_decay=0.01,
                                  disable_tqdm=False, 
                                  logging_strategy="steps", logging_steps=batch_size,  # Number of update steps between two logs if logging_strategy="steps".
                                  push_to_hub=True)

In [86]:
# this model needs to be crated at huggingface:
f"{xlmr_model_name}-custom"

'maren-hugg/xlm-roberta-base-finetuned-panx-en-custom'

In [64]:
# pad each input sequence to the largest sequence length in a batch
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [87]:
# init method, to avoid initializing a new model for every Trainer (it loads the untrained model and is called at the beginning of the train() call)
def model_init():
  return (XLMRobertaForTokenClassification
          .from_pretrained(xlmr_model_name, config=xlmr_config)
          .to(device))

In [88]:
trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=ds_custom_encoded["train"],
                  eval_dataset=ds_custom_encoded["validation"],
                  tokenizer=xlmr_tokenizer)

/content/maren-hugg/xlm-roberta-base-finetuned-panx-en-custom is already a clone of https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en-custom. Make sure you pull the latest changes with `repo.git_pull()`.


In [89]:
trainer.train()



Step,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,Classification Report
16,0.1384,0.108498,0.832934,0.804012,0.864013,0.832934,precision recall f1-score support  LOC 0.77 0.84 0.80 153  ORG 0.84 0.87 0.86 423  PER 0.92 0.81 0.86 27  micro avg 0.83 0.86 0.84 603  macro avg 0.84 0.84 0.84 603 weighted avg 0.83 0.86 0.84 603
32,0.0953,0.099687,0.860577,0.832558,0.890547,0.860577,precision recall f1-score support  LOC 0.87 0.79 0.83 153  ORG 0.83 0.92 0.87 423  PER 0.96 0.85 0.90 27  micro avg 0.84 0.88 0.86 603  macro avg 0.89 0.85 0.87 603 weighted avg 0.85 0.88 0.86 603
48,0.051,0.10126,0.875817,0.863124,0.888889,0.875817,precision recall f1-score support  LOC 0.87 0.86 0.86 153  ORG 0.86 0.90 0.88 423  PER 0.96 0.85 0.90 27  micro avg 0.87 0.89 0.88 603  macro avg 0.90 0.87 0.88 603 weighted avg 0.87 0.89 0.88 603
64,0.0583,0.106614,0.861931,0.830769,0.895522,0.861931,precision recall f1-score support  LOC 0.88 0.85 0.86 153  ORG 0.83 0.91 0.87 423  PER 0.89 0.93 0.91 27  micro avg 0.84 0.89 0.87 603  macro avg 0.87 0.89 0.88 603 weighted avg 0.84 0.89 0.87 603
80,0.0349,0.097914,0.883495,0.862559,0.905473,0.883495,precision recall f1-score support  LOC 0.88 0.87 0.88 153  ORG 0.87 0.91 0.89 423  PER 0.89 0.93 0.91 27  micro avg 0.87 0.90 0.89 603  macro avg 0.88 0.90 0.89 603 weighted avg 0.87 0.90 0.89 603
96,0.0395,0.098018,0.880196,0.865385,0.895522,0.880196,precision recall f1-score support  LOC 0.88 0.87 0.87 153  ORG 0.88 0.90 0.89 423  PER 0.89 0.89 0.89 27  micro avg 0.88 0.89 0.88 603  macro avg 0.88 0.89 0.88 603 weighted avg 0.88 0.89 0.88 603


TrainOutput(global_step=96, training_loss=0.06956147899230321, metrics={'train_runtime': 10.1328, 'train_samples_per_second': 148.034, 'train_steps_per_second': 9.474, 'total_flos': 61213573662120.0, 'train_loss': 0.06956147899230321, 'epoch': 3.0})

In [90]:
trainer.push_to_hub(commit_message="Training completed!")

Upload file runs/Jun12_07-44-54_32324690376c/events.out.tfevents.1686555907.32324690376c.8733.7:   0%|        …

Upload file training_args.bin:   0%|          | 1.00/3.87k [00:00<?, ?B/s]

To https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en-custom
   9610185..96a24bc  main -> main

   9610185..96a24bc  main -> main

To https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en-custom
   96a24bc..4249c17  main -> main

   96a24bc..4249c17  main -> main



'https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en-custom/commit/96a24bc7c3e72b99d3db227da606169dbeba861c'

In [91]:
# info about trainer log history
metrics_train=trainer.state.log_history
for item in metrics_train:
  print(item)

{'loss': 0.1384, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5, 'step': 16}
{'eval_loss': 0.10849767923355103, 'eval_f1': 0.8329336530775381, 'eval_precision': 0.8040123456790124, 'eval_recall': 0.8640132669983416, 'eval_accuracy': 0.8329336530775381, 'eval_classification_report': '              precision    recall  f1-score   support\n\n         LOC       0.77      0.84      0.80       153\n         ORG       0.84      0.87      0.86       423\n         PER       0.92      0.81      0.86        27\n\n   micro avg       0.83      0.86      0.84       603\n   macro avg       0.84      0.84      0.84       603\nweighted avg       0.83      0.86      0.84       603\n', 'eval_runtime': 0.4469, 'eval_samples_per_second': 559.389, 'eval_steps_per_second': 35.801, 'epoch': 0.5, 'step': 16}
{'loss': 0.0953, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0, 'step': 32}
{'eval_loss': 0.09968652576208115, 'eval_f1': 0.8605769230769231, 'eval_precision': 0.8325581395348837, 'eval_recal