In [1]:
from transformers import AutoTokenizer

bert_model_name = "bert-base-cased"
xlmr_model_name = "xlm-roberta-base"

bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
text = "Mirza loves Dhaka city"
bert_tokenizer(text).tokens()

['[CLS]', 'Mirza', 'loves', 'Dhaka', 'city', '[SEP]']

In [3]:
xlmr_tokenizer(text).tokens()

['<s>', '▁Mirza', '▁love', 's', '▁Dhaka', '▁city', '</s>']

### The Tokenizer Pipeline

Normalization: removing whitspaces(stripping), lower case characters (mirza loves dhaka city)

Pretokenization: split the text into words ('mirza', 'loves', 'dhaka', 'city')

Tokenizer Model: spliting words into subwords using BPE/Unigram/WordPiece ('mir', 'za', 'loves', 'dhaka', 'city')

Postprocessing: adding special tokens ('CLS', 'mir', 'za', 'loves', 'dhaka', 'city', 'SEP')

### The SentencePiece Tokenizer

In [4]:
xlmr_token = xlmr_tokenizer(text).tokens()
"".join(xlmr_token).replace(u"\u2581", " ")

'<s> Mirza loves Dhaka city</s>'

### Transformers For NER: Creating Cutom Model for Token Classification

In [5]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    config_class = XLMRobertaConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False) # load model body
        self.dropout = nn.Dropout(config.hidden_dropout_prob) # set up token classification head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights() # load and initialize weights

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,labels=None, **kwargs):
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, **kwargs)
        sequence_output = self.dropout(outputs[0]) # apply classifier to encoder representation
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=outputs.hidden_states,
                                     attentions=outputs.attentions)

### Loading a Custom Model

In [6]:
from datasets import load_dataset
from collections import defaultdict
from datasets import DatasetDict

load_dataset("xtreme", name="PAN-X.de")

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059] # imbalanced: to make it more realistic

panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split in ds:
        panx_ch[lang][split] = (
            ds[split].shuffle(seed=0).select(range(int(frac*ds[split].num_rows)))
        )

tags = panx_ch["de"]["train"].features["ner_tags"].feature
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [7]:
index2tag = {idx: tag for idx, tag in enumerate(tags.names)}
tag2index = {tag: idx for idx, tag in enumerate(tags.names)}

In [8]:
from transformers import AutoConfig

xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                         num_labels=tags.num_classes,
                                         id2label=index2tag, label2id=tag2index)



In [9]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification
              .from_pretrained(xlmr_model_name, config=xlmr_config)
              .to(device))

torch.cuda.empty_cache()

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import pandas as pd

input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
pd.DataFrame([xlmr_token, input_ids[0].numpy()], index=["Toekns", "Input IDs"])

Unnamed: 0,0,1,2,3,4,5,6
Toekns,<s>,▁Mirza,▁love,s,▁Dhaka,▁city,</s>
Input IDs,0,187279,5161,7,111617,26349,2


In [11]:
outputs = xlmr_model(input_ids.to(device)).logits
predictions = torch.argmax(outputs, dim=-1)

print(f"Number of tokens in sequence: {len(xlmr_token)}")
print(f"Shape of output:{outputs.shape}")

Number of tokens in sequence: 7
Shape of output:torch.Size([1, 7, 7])


In [12]:
preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
pd.DataFrame([xlmr_token, preds], index=["Tokens","Tags"])

Unnamed: 0,0,1,2,3,4,5,6
Tokens,<s>,▁Mirza,▁love,s,▁Dhaka,▁city,</s>
Tags,B-LOC,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER


In [13]:
def tag_text(text, tags, model, tokenizer):
    tokens = tokenizer(text).tokens()
    input_ids = xlmr_tokenizer(text, return_tensors="pt").input_ids.to(device)
    outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([xlmr_token, preds], index=["Tokens","Tags"])

### Tokenizing Texts for NER

In [14]:
# function(examples: Dict[str,List]) -> Dict[str,List]

In [15]:
def label_int2str(row):
    return panx_ch["train"].features["label"].int2str(row)

In [16]:
def create_tag_names(batch):
    return {"ner_tags_str":[tags.int2str(idx) for idx in batch["ner_tags"]]}

In [17]:
panx_de = panx_ch["de"].map(create_tag_names)
de_example = panx_de["train"][0]

words, labels = de_example['tokens'], de_example['ner_tags']

In [18]:
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [19]:
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Toekns", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Toekns,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [20]:
previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(labels[word_idx])

    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Toekns", "Word IDS", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, label_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Toekns,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDS,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True,
                                      is_split_into_words=True)
    labels = []
    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [22]:
def encode_panx_dataset(corpus):
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=['langs', 'ner_tags', 'tokens'])

In [23]:
panx_de_encoded = encode_panx_dataset(panx_ch["de"])

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

### Performance Measures

In [24]:
import numpy as np

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    labels_list, preds_list = [], []

    for batch_idx in range(batch_size):
        example_labels, example_preds = [], []
        for seq_idx in range(seq_len):
            if label_ids[batch_idx, seq_idx] != -100: # ignore label IDs = -100
                example_labels.append(index2tag[label_ids[batch_idx][seq_idx]])
                example_preds.append(index2tag[preds[batch_idx][seq_idx]])

        labels_list.append(example_labels)
        preds_list.append(example_preds)

    return preds_list, labels_list

### Fine-Tuning XLM-RoBERTa

In [25]:
from transformers import TrainingArguments

num_epochs = 2
batch_size = 8
logging_steps = len(panx_de_encoded["train"]) // batch_size
model_name = f"{xlmr_model_name}-finetuned-panx-de"

training_args = TrainingArguments(
    output_dir=model_name,
    log_level="error",
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_steps=1e6,
    weight_decay=0.01,
    disable_tqdm=False,
    logging_steps=logging_steps,
    gradient_accumulation_steps=2,
    push_to_hub=True
    )



In [26]:
from huggingface_hub import login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [27]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    y_pred, y_true = align_predictions(eval_pred.predictions,
                                       eval_pred.label_ids)
    return {"f1": f1_score(y_true, y_pred)}

In [28]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [29]:
def model_init():
    return (XLMRobertaForTokenClassification
            .from_pretrained(xlmr_model_name, config=xlmr_config)
            .to(device))

In [30]:
from transformers import Trainer

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=panx_de_encoded["train"],
    eval_dataset=panx_de_encoded["validation"],
    tokenizer=xlmr_tokenizer
    )

trainer.train()

Epoch,Training Loss,Validation Loss,F1
0,No log,0.156799,0.824563


Epoch,Training Loss,Validation Loss,F1
0,No log,0.156799,0.824563
1,0.191100,0.137517,0.855347


training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1720436975.74e446d7bca6.5930.0:   0%|          | 0.00/6.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mirzaaa10/xlm-roberta-base-finetuned-panx-de/commit/58af2bafdc7f984006a777927303a7d14d11b5a5', commit_message='Training Complete', commit_description='', oid='58af2bafdc7f984006a777927303a7d14d11b5a5', pr_url=None, pr_revision=None, pr_num=None)

In [31]:
# let's check the model
text_de = "Mirza ist ein Informatiker bei Google in Kalifornier"
tag_text(text_de, tags, xlmr_model, xlmr_tokenizer)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,<s>,▁Mirza,▁love,s,▁Dhaka,▁city,</s>,,,,,
Tags,B-LOC,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-PER,B-LOC


***Error Analysis***

In [33]:
from torch.nn.functional import cross_entropy

def forward_pass_with_labels(batch):
  features = [dict(zip(batch,t)) for t in zip(*batch.values())]
  batch = data_collator(features)
  input_ids = batch["input_ids"].to(device)
  attention_mask = batch["attention_mask"].to(device)
  labels = batch["labels"].to(device)

  with torch.no_grad():
    outputs = trainer.model(input_ids, attention_mask=attention_mask)
    predicted_labels = torch.argmax(outputs.logits, axis=-1).cpu().numpy()

  loss = cross_entropy(outputs.logits.view(-1, outputs.logits.shape[-1]), labels.view(-1), reduction="none")
  loss = loss.view(len(input_ids), -1).cpu().numpy()
  return {"loss":loss, "predicted_labels":predicted_labels}

In [34]:
valid_set = panx_de_encoded["validation"]
valid_set = valid_set.map(forward_pass_with_labels, batched=True, batch_size=32)
df = valid_set.to_pandas()
df.head()

Map:   0%|          | 0/6290 [00:00<?, ? examples/s]

Unnamed: 0,input_ids,attention_mask,labels,loss,predicted_labels
0,"[0, 10699, 11, 15, 16104, 1388, 2]","[1, 1, 1, 1, 1, 1, 1]","[-100, 3, -100, 4, 4, 4, -100]","[0.0, 0.020124305, 0.0, 0.02122533, 0.01606438...","[4, 3, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 56530, 25216, 30121, 152385, 19229, 83982,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, -100, -100, -100, -100, 3, -100, -10...","[0.0, 0.00032765264, 0.0, 0.0, 0.0, 0.0, 2.132...","[0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 6, ..."
2,"[0, 159093, 165, 38506, 122, 153080, 29088, 57...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, 3, -100, -100, 0, -100, 0, ...","[0.0, 0.00021407696, 0.0001546025, 0.000167355...","[0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 16459, 242, 5106, 6, 198715, 5106, 242, 2]","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 5, -100, 0, 0, -100]","[0.0, 0.00039438574, 0.0003650714, 0.000364237...","[5, 0, 0, 0, 5, 6, 0, 0, 5, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 11022, 2315, 7418, 1079, 8186, 57242, 97, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, 3, ...","[0.0, 0.00017796364, 0.0001677134, 0.000205734...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, ..."


In [35]:
# load the model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("mirzaaa10/xlm-roberta-base-finetuned-panx-de")
model = AutoModelForTokenClassification.from_pretrained("mirzaaa10/xlm-roberta-base-finetuned-panx-de")

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]