<a href="https://colab.research.google.com/github/marendtz/News_NER/blob/master/XLMR_finetune_pan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Preparations**

In [1]:
# for debugging: clean up model
import shutil
try:
  shutil.rmtree('/content/xlm-roberta-base-finetuned-panx-en')
except:
  print("no such folder")

no such folder


In [3]:
# get github access token for collab
import getpass
token = getpass.getpass()

··········


In [4]:
# clean workdir
%cd /content/
%rm -r /content/*
%ls -la
%pwd

/content
total 16
drwxr-xr-x 1 root root 4096 Apr 30 08:01 [0m[01;34m.[0m/
drwxr-xr-x 1 root root 4096 Apr 30 07:56 [01;34m..[0m/
drwxr-xr-x 4 root root 4096 Apr 27 13:34 [01;34m.config[0m/


'/content'

In [6]:
# clone github repo
!git clone https://{token}@github.com/marendtz/News_NER.git

Cloning into 'News_NER'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 18 (delta 6), reused 9 (delta 1), pack-reused 0[K
Unpacking objects: 100% (18/18), 139.20 KiB | 2.96 MiB/s, done.


In [None]:
# install dependencies
%pip install -r /content/News_NER/requirements.txt

In [8]:
# import packages
from collections import defaultdict
import pandas as pd
import numpy as np
from datasets import load_dataset_builder, get_dataset_config_names, load_dataset, DatasetDict

from transformers import AutoTokenizer, TrainingArguments, DataCollatorForTokenClassification, Trainer
import nltk.data
nltk.download('punkt')

import torch.nn as nn
from transformers import XLMRobertaConfig, AutoConfig, XLMRobertaForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput

from huggingface_hub import notebook_login
from seqeval.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report
from seqeval.scheme import IOB2


from torch.nn.functional import cross_entropy
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Get and Analyze dataset from Huggingface**

In [10]:
# define model

# get info from dataset later used for training our model
# load dataset to get the tags used, so we later take the same...
lang = "en"
ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
tags = ds["train"].features["ner_tags"].feature
print(tags)

# generate function to convert name and id of tags
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}
print(index2tag)
print(tag2index)

Downloading builder script:   0%|          | 0.00/37.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/593k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/105k [00:00<?, ?B/s]

Downloading and preparing dataset xtreme/PAN-X.en to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}


In [None]:
# helper steps for later (aggregating the test steps from before)
def tag_text(text, tags, model, tokenizer):
  # Get tokens with special characters
  tokens = tokenizer(text).tokens()
  # Encode the sequence into IDs
  input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
  # Get predictions as distribution over 7 possible classes
  outputs = model(input_ids)[0]
  # Take argmax to get most likely class per token
  predictions = torch.argmax(outputs, dim=2)
  # Convert to DataFrame
  preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
  return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

**Prepare input data for following training of the model XLMRobertaForTokenClassification (with our config=labels, device, ...)**

In [11]:
# show how labels are given in Pan-X subsets from the XTREME dataset and use the same 
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

# PAN-X subsets within XTREME dataset
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
print(f"PAN-X has {len(panx_subsets)} configurations")


XTREME has 183 configurations
PAN-X has 40 configurations


In [12]:
# prepare dataset
panx_ch = defaultdict(DatasetDict)

langs = ["en"] # e.g. ["de","en"]
fracs = [1.0] # e.g. [0.5,0.5]
# panx_ch contains keys for each language and one level lower keys for all splits - here only eng and all data is selected
for lang, frac in zip(langs, fracs):
    # Load monolingual corpus
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle and downsample each split according to spoken proportion
    for split in ds:
        panx_ch[lang][split] = (
            ds[split]
            .shuffle(seed=0)
            .select(range(int(frac * ds[split].num_rows))))
print(panx_ch)
print("Example element in Pan-X Dataset:")
panx_ch_element = panx_ch["en"]["train"][123]
for key, value in panx_ch_element.items():
    print(f"{key}: {value}")



  0%|          | 0/3 [00:00<?, ?it/s]

defaultdict(<class 'datasets.dataset_dict.DatasetDict'>, {'en': DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})})
Example element in Pan-X Dataset:
tokens: ['Anders', 'Oechsler', '(', 'REtires', ')']
ner_tags: [1, 2, 0, 0, 0]
langs: ['en', 'en', 'en', 'en', 'en']


In [13]:
# show info about tag feature --> we already defined the varibale tags earlier
# --> DatasetDict holds information about all features used (like infos on columns in dataframe)
print('----------features----------------')
print(panx_ch["en"]["train"].features)
print('----------feature ner_tags----------------')
print(panx_ch["en"]["train"].features["ner_tags"].feature)


----------features----------------
{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None), 'langs': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}
----------feature ner_tags----------------
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [15]:
# examples
print(panx_ch["en"]["train"][10])
print(panx_ch["en"]["train"][10]["tokens"])
print(panx_ch["en"]["train"][10]["ner_tags"])

{'tokens': ['Its', 'source', 'is', 'near', 'Mega', 'Dereio', '.'], 'ner_tags': [0, 0, 0, 0, 5, 6, 0], 'langs': ['en', 'en', 'en', 'en', 'en', 'en', 'en']}
['Its', 'source', 'is', 'near', 'Mega', 'Dereio', '.']
[0, 0, 0, 0, 5, 6, 0]


**Set tokenizer**

In [None]:
# setup for tokenization
xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

**Define helper function for data preparation**

In [16]:
# the library datasets provides a fast way to tokenize dataset objects with the map() operation.
# the returned input ids need to be augmented with the attention mask and the label ids, that encode the information about with token is associated with each NER tag

def tokenize_and_align_labels(examples):
  tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)  # sequence is already split in words
  labels = []
  for idx, label in enumerate(examples["ner_tags"]):
    word_ids = tokenized_inputs.word_ids(batch_index=idx) # get word ids to associate subwords
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
      if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100) # -100 is chose, since the PyTorch cross-entropy loss class torch.nn.CrossEntropyLoss has an attribute ignore_index, whose value is -100, which is therefor ignored during training
      else:
        label_ids.append(label[word_idx])
      previous_word_idx = word_idx
    labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs

def encode_panx_dataset(corpus):
  return corpus.map(tokenize_and_align_labels, batched=True,remove_columns=['langs', 'ner_tags', 'tokens'])



In [19]:
# encode dataset
panx_en_encoded = encode_panx_dataset(panx_ch["en"])
print("Encoded example:")
print(panx_en_encoded["train"][2])



Encoded example:
{'input_ids': [0, 9079, 7113, 202104, 11491, 6, 4, 9079, 7113, 15, 5106, 210298, 1104, 151210, 6, 4, 20271, 30839, 6, 167618, 5106, 1388, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 5, -100, 6, 6, 0, -100, 5, -100, 0, 0, -100, -100, -100, 0, -100, 0, 3, 4, -100, 0, 0, -100]}


**Define performance measures**

In [20]:
# common approach: results for precision, recall, F1-score
# in NER: all words of an entity need to be predicted correctly in order for a prediction to be counted as correct
# libary seqeval: expects predictions and labels as lists of lists 


# during training we need to convert the outputs of the model into such a list that sequeval expects
def align_predictions(predictions, label_ids):
  preds = np.argmax(predictions, axis=2)
  batch_size, seq_len = preds.shape
  labels_list, preds_list = [], []

  for batch_idx in range(batch_size):
    example_labels, example_preds = [], []
    for seq_idx in range(seq_len):
      # Ignore label IDs = -100
      if label_ids[batch_idx, seq_idx] != -100:
        example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
        example_preds.append(index2tag[preds[batch_idx][seq_idx]])
    labels_list.append(example_labels)
    preds_list.append(example_preds)
  return preds_list, labels_list

# helper function for calculation of F1-score for validation set
def compute_metrics(eval_pred):
  y_pred, y_true = align_predictions(eval_pred.predictions, eval_pred.label_ids)
  return {"f1": f1_score(y_true, y_pred),
          "precision": precision_score(y_true, y_pred),
          "recall": recall_score(y_true, y_pred),
          "accuracy": f1_score(y_true, y_pred),
          "classification_report": classification_report(y_true, y_pred, mode='strict', scheme=IOB2)
          }


**Finetuning XLM-RoBERTa and Upload to Huggingface**

In [21]:
# login to HugginFace to enable upload of model
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
# hide_output

import torch

# passing keyword arguments to the from_pretrained() method overrides default values
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,num_labels=tags.num_classes, id2label=index2tag, label2id=tag2index)

# define "device"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [29]:
# docu: https://huggingface.co/docs/transformers/v4.28.1/en/main_classes/trainer#transformers.TrainingArguments
num_epochs = 3
batch_size = 24
logging_steps = len(panx_en_encoded["train"]) // batch_size 
model_name = f"{xlmr_model_name}-finetuned-panx-en"
training_args = TrainingArguments(output_dir=model_name, log_level="error", num_train_epochs=num_epochs,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  evaluation_strategy="steps", eval_steps=batch_size, # Number of update steps between two evaluations if evaluation_strategy="steps".
                                  save_strategy="steps", save_steps=1e6, 
                                  weight_decay=0.01,
                                  disable_tqdm=False, 
                                  logging_strategy="steps", logging_steps=batch_size,  # Number of update steps between two logs if logging_strategy="steps".
                                  push_to_hub=True)

In [23]:
# this model needs to be crated at huggingface:
f"{xlmr_model_name}-finetuned-panx-en"

'xlm-roberta-base-finetuned-panx-en'

In [25]:
# pad each input sequence to the largest sequence length in a batch
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [26]:
# init method, to avoid initializing a new model for every Trainer (it loads the untrained model and is called at the beginning of the train() call)
def model_init():
  return (XLMRobertaForTokenClassification
          .from_pretrained(xlmr_model_name, config=xlmr_config)
          .to(device))

In [30]:
trainer = Trainer(model_init=model_init, args=training_args,
                  data_collator=data_collator,
                  compute_metrics=compute_metrics,
                  train_dataset=panx_en_encoded["train"],
                  eval_dataset=panx_en_encoded["validation"],
                  tokenizer=xlmr_tokenizer)

/content/xlm-roberta-base-finetuned-panx-en is already a clone of https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en. Make sure you pull the latest changes with `repo.git_pull()`.


In [31]:
trainer.train()



Step,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy,Classification Report
24,1.2644,0.817467,0.421248,0.369598,0.489679,0.421248,precision recall f1-score support  LOC 0.28 0.18 0.22 4834  ORG 0.39 0.15 0.22 4677  PER 0.66 0.72 0.69 4635  micro avg 0.49 0.35 0.41 14146  macro avg 0.44 0.35 0.38 14146 weighted avg 0.44 0.35 0.37 14146
48,0.7209,0.563264,0.481697,0.418967,0.566521,0.481697,precision recall f1-score support  LOC 0.35 0.49 0.41 4834  ORG 0.47 0.51 0.49 4677  PER 0.77 0.69 0.73 4635  micro avg 0.50 0.56 0.53 14146  macro avg 0.53 0.57 0.54 14146 weighted avg 0.53 0.56 0.54 14146
72,0.5951,0.467027,0.605884,0.558773,0.661671,0.605884,precision recall f1-score support  LOC 0.60 0.63 0.61 4834  ORG 0.57 0.51 0.54 4677  PER 0.74 0.81 0.78 4635  micro avg 0.64 0.65 0.65 14146  macro avg 0.64 0.65 0.64 14146 weighted avg 0.64 0.65 0.64 14146
96,0.4475,0.442492,0.665884,0.633618,0.701612,0.665884,precision recall f1-score support  LOC 0.65 0.75 0.70 4834  ORG 0.65 0.51 0.57 4677  PER 0.80 0.82 0.81 4635  micro avg 0.70 0.70 0.70 14146  macro avg 0.70 0.70 0.69 14146 weighted avg 0.70 0.70 0.69 14146
120,0.4978,0.446937,0.637484,0.592957,0.689241,0.637484,precision recall f1-score support  LOC 0.66 0.68 0.67 4834  ORG 0.60 0.52 0.56 4677  PER 0.72 0.85 0.78 4635  micro avg 0.66 0.68 0.67 14146  macro avg 0.66 0.68 0.67 14146 weighted avg 0.66 0.68 0.67 14146
144,0.4383,0.40932,0.700302,0.666773,0.737382,0.700302,precision recall f1-score support  LOC 0.75 0.71 0.73 4834  ORG 0.61 0.63 0.62 4677  PER 0.77 0.86 0.81 4635  micro avg 0.71 0.73 0.72 14146  macro avg 0.71 0.73 0.72 14146 weighted avg 0.71 0.73 0.72 14146
168,0.4148,0.368846,0.712246,0.68766,0.738654,0.712246,precision recall f1-score support  LOC 0.71 0.74 0.73 4834  ORG 0.66 0.64 0.65 4677  PER 0.86 0.81 0.84 4635  micro avg 0.74 0.73 0.74 14146  macro avg 0.74 0.73 0.74 14146 weighted avg 0.74 0.73 0.74 14146
192,0.4513,0.369968,0.723576,0.708127,0.739714,0.723576,precision recall f1-score support  LOC 0.78 0.71 0.74 4834  ORG 0.66 0.63 0.64 4677  PER 0.84 0.85 0.84 4635  micro avg 0.76 0.73 0.74 14146  macro avg 0.76 0.73 0.74 14146 weighted avg 0.76 0.73 0.74 14146
216,0.3786,0.366571,0.730411,0.71249,0.749258,0.730411,precision recall f1-score support  LOC 0.74 0.80 0.77 4834  ORG 0.67 0.61 0.64 4677  PER 0.88 0.81 0.84 4635  micro avg 0.76 0.74 0.75 14146  macro avg 0.77 0.74 0.75 14146 weighted avg 0.77 0.74 0.75 14146
240,0.425,0.365239,0.704618,0.687374,0.722748,0.704618,precision recall f1-score support  LOC 0.74 0.72 0.73 4834  ORG 0.64 0.58 0.61 4677  PER 0.83 0.86 0.84 4635  micro avg 0.74 0.72 0.73 14146  macro avg 0.74 0.72 0.73 14146 weighted avg 0.74 0.72 0.73 14146


TrainOutput(global_step=2502, training_loss=0.26952576098872794, metrics={'train_runtime': 1164.6892, 'train_samples_per_second': 51.516, 'train_steps_per_second': 2.148, 'total_flos': 1178415902403696.0, 'train_loss': 0.26952576098872794, 'epoch': 3.0})

In [32]:
trainer.push_to_hub(commit_message="Training completed!")

Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Upload file pytorch_model.bin:   0%|          | 1.00/1.03G [00:00<?, ?B/s]

Upload file tokenizer.json:   0%|          | 1.00/16.3M [00:00<?, ?B/s]

Upload file training_args.bin:   0%|          | 1.00/3.50k [00:00<?, ?B/s]

Upload file runs/Apr30_08-22-35_69492b803129/events.out.tfevents.1682843463.69492b803129.919.0:   0%|         …

Upload file runs/Apr30_08-22-35_69492b803129/1682843463.5521657/events.out.tfevents.1682843463.69492b803129.91…

Upload file runs/Apr30_08-34-57_69492b803129/events.out.tfevents.1682843719.69492b803129.919.2:   0%|         …

Upload file runs/Apr30_08-34-57_69492b803129/1682843719.5457287/events.out.tfevents.1682843719.69492b803129.91…

To https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en
   2738258..4e18f14  main -> main

   2738258..4e18f14  main -> main

To https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en
   4e18f14..291ed0b  main -> main

   4e18f14..291ed0b  main -> main



'https://huggingface.co/maren-hugg/xlm-roberta-base-finetuned-panx-en/commit/4e18f14bd1a60c1ade4ef6c90149a90b37164380'

In [None]:
# info about trainer log history
metrics_train=trainer.state.log_history
for item in metrics_train:
  print(item)