For details, see the original notebook `LoRA_for_token_classification` [[link](https://github.com/matthiasdroth/Compute-Optimal_LoRA-Adapters_for_Language_Models/blob/main/LoRA_for_token_classification.ipynb)].

ToDo:
- change model to `FacebookAI/roberta-large`
- change dataset to `DFKI-SLT/few-nerd`
- change training to `accelerate`
- count FLOPs via `einops` in training loop
- add logic to find maximum batch size
- add basic sweep and log to wandb

In [1]:
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

# Base checkpoint shared by the model and the tokenizer.
checkpoint = "FacebookAI/roberta-large"
# NOTE(review): no num_labels / id2label is passed here, so the size of the newly
# initialized classifier head comes from the checkpoint's default config — confirm it
# matches the label set before training.
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
# add_prefix_space=True: presumably needed because the tokenizer is later called on
# pre-split words (is_split_into_words=True) — verify against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Load the "supervised" configuration of Few-NERD and merge its three splits into one
# dataset; the "tokens" column is exposed as "words" from here on.
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
official_splits = [fewnerd[split_name] for split_name in ("train", "validation", "test")]
fewnerd_all = concatenate_datasets(official_splits).rename_column("tokens", "words")
fewnerd_all

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [3]:
i = 22  # index of the example inspected here
fewnerd_all[i]["words"]  # the example's list of pre-split words

['Known',
 'locally',
 'as',
 '``',
 'Fairbottom',
 'Bobs',
 '``',
 'it',
 'is',
 'now',
 'preserved',
 'at',
 'the',
 'Henry',
 'Ford',
 'Museum',
 'in',
 'Dearborn',
 ',',
 'Michigan',
 '.']

In [4]:
# Integer tag ids of the same example (compare with the words shown above).
fewnerd_all[i]["ner_tags"]

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [5]:
# Sanity check: the example has as many tags as words.
len(fewnerd_all[i]["words"]), len(fewnerd_all[i]["ner_tags"])

(21, 21)

In [6]:
# Names of the "ner_tags" classes; the position in this list is the integer tag id.
label_names = fewnerd_all.features["ner_tags"].feature.names
label_names

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [7]:
i = 22
words, labels = fewnerd_all[i]["words"], fewnerd_all[i]["ner_tags"]
assert len(words) == len(labels)
# Adapted from https://github.com/matthiasdroth/Huggingface-course/blob/main/7.2-Token_classification.ipynb
# Render the words and their label names as two column-aligned lines: every column is as
# wide as the longer of the two strings, plus one separating space.
word_cells, label_cells = [], []
for word, label in zip(words, labels):
    label_name = label_names[label]
    column_width = max(len(word), len(label_name)) + 1
    word_cells.append(word.ljust(column_width))
    label_cells.append(label_name.ljust(column_width))
line1 = "".join(word_cells)
line2 = "".join(label_cells)
print(line1, line2, sep="\n")

Known locally as `` Fairbottom Bobs    `` it is now preserved at the Henry    Ford     Museum   in Dearborn , Michigan . 
O     O       O  O  product    product O  O  O  O   O         O  O   building building building O  location O location O 


In [8]:
# Display the tokenizer's repr (class, vocabulary size, special tokens).
tokenizer

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [9]:
# Check that this is a "fast" tokenizer; word_ids() is called on its output further below.
tokenizer.is_fast

True

In [10]:
i = 22
example = fewnerd_all[i]
words, labels = example["words"], example["ner_tags"]
assert len(words) == len(labels)
# One row per word: the integer tag id next to the word it belongs to.
for tag, text in zip(labels, words):
    print(f"label:\t{tag}\tword:\t{text}")

label:	0	word:	Known
label:	0	word:	locally
label:	0	word:	as
label:	0	word:	``
label:	8	word:	Fairbottom
label:	8	word:	Bobs
label:	0	word:	``
label:	0	word:	it
label:	0	word:	is
label:	0	word:	now
label:	0	word:	preserved
label:	0	word:	at
label:	0	word:	the
label:	2	word:	Henry
label:	2	word:	Ford
label:	2	word:	Museum
label:	0	word:	in
label:	4	word:	Dearborn
label:	0	word:	,
label:	4	word:	Michigan
label:	0	word:	.


In [11]:
len(words), len(labels) # tokenizing the words yields a longer list of sub-word tokens => the labels must be adapted accordingly

(21, 21)

In [12]:
i = 22
# Tokenize the pre-split words; the encoding also records which word each token came from.
inputs = tokenizer(fewnerd_all[i]["words"], is_split_into_words=True)
tokens, word_ids = inputs.tokens(), inputs.word_ids()
input_ids, attention_mask = inputs["input_ids"], inputs["attention_mask"]
# All per-token sequences must have the same length.
assert len(tokens) == len(word_ids) == len(input_ids) == len(attention_mask)
# More tokens than word-level labels: the labels have to be expanded to token level.
len(tokens), len(labels)

(26, 21)

In [13]:
# For every token: the index of the word it came from (None at the first and last position).
word_ids, len(word_ids)

([None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None],
 26)

In [14]:
import copy  # not needed below (slicing already copies); kept in case later cells use it

# Drop the first and last position (the ones whose word id is None) so that only tokens
# belonging to an actual word remain. Slicing returns new lists, so the originals are untouched.
simple_word_ids, simple_tokens = word_ids[1:-1], tokens[1:-1]

In [15]:
# Tokens without the first and last position.
simple_tokens

['ĠKnown',
 'Ġlocally',
 'Ġas',
 'Ġ``',
 'ĠFair',
 'bottom',
 'ĠBob',
 's',
 'Ġ``',
 'Ġit',
 'Ġis',
 'Ġnow',
 'Ġpreserved',
 'Ġat',
 'Ġthe',
 'ĠHenry',
 'ĠFord',
 'ĠMuseum',
 'Ġin',
 'ĠDear',
 'born',
 'Ġ,',
 'ĠMichigan',
 'Ġ.']

In [16]:
# Word index of each remaining token; repeated indices mark words split into several tokens.
simple_word_ids

[0,
 1,
 2,
 3,
 4,
 4,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 18,
 19,
 20]

In [17]:
# Expand the word-level labels to token level: every token inherits the label of the
# word it belongs to, so all pieces of a split word carry the same label.
#
# Sentinels are None rather than False: in Python `0 == False` is True, so a False
# sentinel would make the very first token (word id 0) look like a continuation of a
# previous word and silently assign it label 0 regardless of its real label.
previous_word_id = None
previous_label = None
match_labels = []
for i, word_id in enumerate(simple_word_ids):
    if word_id == previous_word_id:
        # further token of the same word => reuse that word's label
        label = previous_label
    else:
        # first token of a new word => look the label up directly by word index
        # (no separate counter that could drift out of sync with the word ids)
        label = labels[word_id]
    match_labels.append(label)
    # remember the current word and its label for the next token
    previous_word_id = word_id
    previous_label = label
    # logs
    print(f"i={i} \t word_id={word_id} \t label={label} \t token={tokens[i+1]}")

# The first and last position were stripped before the loop; they get -100 here.
# NOTE(review): -100 is presumably the index the loss ignores — confirm for the loss used in training.
full_match_labels = [-100] + match_labels + [-100]
full_match_labels

i=0 	 word_id=0 	 label=0 	 token=ĠKnown
i=1 	 word_id=1 	 label=0 	 token=Ġlocally
i=2 	 word_id=2 	 label=0 	 token=Ġas
i=3 	 word_id=3 	 label=0 	 token=Ġ``
i=4 	 word_id=4 	 label=8 	 token=ĠFair
i=5 	 word_id=4 	 label=8 	 token=bottom
i=6 	 word_id=5 	 label=8 	 token=ĠBob
i=7 	 word_id=5 	 label=8 	 token=s
i=8 	 word_id=6 	 label=0 	 token=Ġ``
i=9 	 word_id=7 	 label=0 	 token=Ġit
i=10 	 word_id=8 	 label=0 	 token=Ġis
i=11 	 word_id=9 	 label=0 	 token=Ġnow
i=12 	 word_id=10 	 label=0 	 token=Ġpreserved
i=13 	 word_id=11 	 label=0 	 token=Ġat
i=14 	 word_id=12 	 label=0 	 token=Ġthe
i=15 	 word_id=13 	 label=2 	 token=ĠHenry
i=16 	 word_id=14 	 label=2 	 token=ĠFord
i=17 	 word_id=15 	 label=2 	 token=ĠMuseum
i=18 	 word_id=16 	 label=0 	 token=Ġin
i=19 	 word_id=17 	 label=4 	 token=ĠDear
i=20 	 word_id=17 	 label=4 	 token=born
i=21 	 word_id=18 	 label=0 	 token=Ġ,
i=22 	 word_id=19 	 label=4 	 token=ĠMichigan
i=23 	 word_id=20 	 label=0 	 token=Ġ.


[-100,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100]

Load the [**fewnerd**](https://arxiv.org/pdf/2105.07464v6.pdf) dataset and read the corresponding [**publication**](https://aclanthology.org/2021.acl-long.248/)!

In [18]:
from datasets import DatasetDict

# Fraction held out at each of the two successive splits, and the seed that makes the
# splits (and the tiny "dev" sample) reproducible across kernel restarts.
HOLDOUT_FRACTION = 0.15
SPLIT_SEED = 42
# Resulting proportions when a holdout fraction f is applied twice:
#   test = f, valid = (1 - f) * f, train = (1 - f) ** 2
#   f = 0.10 -> test 10%, valid  9.00%, train 81.00%
#   f = 0.15 -> test 15%, valid 12.75%, train 72.25%
#   f = 0.20 -> test 20%, valid 16.00%, train 64.00%
dataset_cc = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]])
# Four examples for quick smoke tests.
# NOTE: sampled from the full dataset, so these rows also occur in one of the other splits.
dev_split = dataset_cc.train_test_split(test_size=4, seed=SPLIT_SEED)["test"]
# First split: hold out the test set.
trainvalid_test_splits = dataset_cc.train_test_split(test_size=HOLDOUT_FRACTION, seed=SPLIT_SEED)
test_split = trainvalid_test_splits["test"]
trainvalid_split = trainvalid_test_splits["train"]
# Second split: hold out the validation set from what remains.
train_valid_split = trainvalid_split.train_test_split(test_size=HOLDOUT_FRACTION, seed=SPLIT_SEED)
valid_split = train_valid_split["test"]
train_split = train_valid_split["train"]
# train / valid / test must partition the concatenated dataset exactly.
assert len(train_split) + len(valid_split) + len(test_split) == len(dataset_cc)
# Only the words and the fine-grained tags are kept.
dataset_fewnerd = DatasetDict({
    "train": train_split,
    "valid": valid_split,
    "test": test_split,
    "dev": dev_split
}).remove_columns(["id", "ner_tags"])
dataset_fewnerd

DatasetDict({
    train: Dataset({
        features: ['tokens', 'fine_ner_tags'],
        num_rows: 136002
    })
    valid: Dataset({
        features: ['tokens', 'fine_ner_tags'],
        num_rows: 24001
    })
    test: Dataset({
        features: ['tokens', 'fine_ner_tags'],
        num_rows: 28236
    })
    dev: Dataset({
        features: ['tokens', 'fine_ner_tags'],
        num_rows: 4
    })
})

<font style="font-weight:300">✔</font>