For details, see the original notebook on `LoRA_for_token_classifcation` [[link](https://github.com/matthiasdroth/Compute-Optimal_LoRA-Adapters_for_Language_Models/blob/main/LoRA_for_token_classification.ipynb)].

ToDo:
- change model to `FacebookAI/roberta-large`
- change dataset to `DFKI-SLT/few-nerd`
- change training to `accelerate`
- count FLOPs via `einops` in training loop
- add logic to find maximum batch size
- add basic sweep and log to wandb

In [1]:
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

checkpoint = "FacebookAI/roberta-large"
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]])
#fewnerd_all = fewnerd_all.rename_column("tokens", "text")
fewnerd_all = fewnerd_all.rename_column("tokens", "words")
fewnerd_all

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [3]:
i = 22
fewnerd_all[i]["words"]

['Known',
 'locally',
 'as',
 '``',
 'Fairbottom',
 'Bobs',
 '``',
 'it',
 'is',
 'now',
 'preserved',
 'at',
 'the',
 'Henry',
 'Ford',
 'Museum',
 'in',
 'Dearborn',
 ',',
 'Michigan',
 '.']

In [4]:
fewnerd_all[i]["ner_tags"]

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [5]:
len(fewnerd_all[i]["words"]), len(fewnerd_all[i]["ner_tags"])

(21, 21)

In [6]:
label_names = fewnerd_all.features["ner_tags"].feature.names
label_names

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [7]:
i = 22
words = fewnerd_all[i]["words"]
labels = fewnerd_all[i]["ner_tags"]
assert len(words)==len(labels)
# https://github.com/matthiasdroth/Huggingface-course/blob/main/7.2-Token_classification.ipynb
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Known locally as `` Fairbottom Bobs    `` it is now preserved at the Henry    Ford     Museum   in Dearborn , Michigan . 
O     O       O  O  product    product O  O  O  O   O         O  O   building building building O  location O location O 


In [8]:
tokenizer

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [9]:
tokenizer.is_fast

True

In [10]:
i = 22
words = fewnerd_all[i]["words"]
labels = fewnerd_all[i]["ner_tags"]
assert len(words)==len(labels)
for j in range(len(words)):
    print(f"label:\t{labels[j]}\tword:\t{words[j]}")

label:	0	word:	Known
label:	0	word:	locally
label:	0	word:	as
label:	0	word:	``
label:	8	word:	Fairbottom
label:	8	word:	Bobs
label:	0	word:	``
label:	0	word:	it
label:	0	word:	is
label:	0	word:	now
label:	0	word:	preserved
label:	0	word:	at
label:	0	word:	the
label:	2	word:	Henry
label:	2	word:	Ford
label:	2	word:	Museum
label:	0	word:	in
label:	4	word:	Dearborn
label:	0	word:	,
label:	4	word:	Michigan
label:	0	word:	.


In [11]:
len(words), len(labels) # tokenizing idx_tokens results in a longer list => adapt labels accordingly

(21, 21)

In [12]:
i = 22
inputs = tokenizer(fewnerd_all[i]["words"], is_split_into_words=True)
tokens = inputs.tokens()
word_ids = inputs.word_ids()
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask
assert len(tokens)==len(word_ids) and len(tokens)==len(input_ids) and len(tokens)==len(attention_mask)
len(tokens), len(labels)

(26, 21)

In [13]:
word_ids, len(word_ids)

([None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None],
 26)

In [14]:
import copy
labels_cp = copy.copy(labels)

In [15]:
# process only vanilla tokens, not <s> and </s> tokens
simple_word_ids = copy.copy(word_ids)[1:-1]
len(simple_word_ids), simple_word_ids[0], simple_word_ids[-1]

(24, 0, 20)

In [16]:
simple_tokens = tokens[1:-1]
len(simple_tokens), simple_tokens[0], simple_tokens[-1]

(24, 'ĠKnown', 'Ġ.')

In [17]:
simple_tokens

['ĠKnown',
 'Ġlocally',
 'Ġas',
 'Ġ``',
 'ĠFair',
 'bottom',
 'ĠBob',
 's',
 'Ġ``',
 'Ġit',
 'Ġis',
 'Ġnow',
 'Ġpreserved',
 'Ġat',
 'Ġthe',
 'ĠHenry',
 'ĠFord',
 'ĠMuseum',
 'Ġin',
 'ĠDear',
 'born',
 'Ġ,',
 'ĠMichigan',
 'Ġ.']

In [18]:
simple_word_ids

[0,
 1,
 2,
 3,
 4,
 4,
 5,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 17,
 18,
 19,
 20]

In [19]:
labels

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [20]:
tokens

['<s>',
 'ĠKnown',
 'Ġlocally',
 'Ġas',
 'Ġ``',
 'ĠFair',
 'bottom',
 'ĠBob',
 's',
 'Ġ``',
 'Ġit',
 'Ġis',
 'Ġnow',
 'Ġpreserved',
 'Ġat',
 'Ġthe',
 'ĠHenry',
 'ĠFord',
 'ĠMuseum',
 'Ġin',
 'ĠDear',
 'born',
 'Ġ,',
 'ĠMichigan',
 'Ġ.',
 '</s>']

In [29]:
previous_word_id = False
previous_label = False
label_index = 0
match_labels = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    if i==0:
        match_labels.append(labels[0])
    if word_id==previous_word_id and i>0:# and type(previous_word_id)==int: # second condition handles "False==0"
        print()
        # add previous_label
        match_labels.append(previous_label)
    else:
        # incremente label_index
        label_index += 1
        # get label via label_index
        label = labels[label_index]
        # add label to match_labels
        match_labels.append(label)
    # update previous_word_id
    previous_word_id = word_id
    # update previous_label
    previous_label = label
    # logs
    print(f"i={i} \t word_id={word_id} \t label={label} \t token={tokens[i+1]}")

i=0 	 word_id=0 	 label=0 	 token=ĠKnown
i=1 	 word_id=1 	 label=0 	 token=Ġlocally
i=2 	 word_id=2 	 label=0 	 token=Ġas
i=3 	 word_id=3 	 label=8 	 token=Ġ``
i=4 	 word_id=4 	 label=8 	 token=ĠFair

i=5 	 word_id=4 	 label=8 	 token=bottom
i=6 	 word_id=5 	 label=0 	 token=ĠBob

i=7 	 word_id=5 	 label=0 	 token=s
i=8 	 word_id=6 	 label=0 	 token=Ġ``
i=9 	 word_id=7 	 label=0 	 token=Ġit
i=10 	 word_id=8 	 label=0 	 token=Ġis
i=11 	 word_id=9 	 label=0 	 token=Ġnow
i=12 	 word_id=10 	 label=0 	 token=Ġpreserved
i=13 	 word_id=11 	 label=0 	 token=Ġat
i=14 	 word_id=12 	 label=2 	 token=Ġthe
i=15 	 word_id=13 	 label=2 	 token=ĠHenry
i=16 	 word_id=14 	 label=2 	 token=ĠFord
i=17 	 word_id=15 	 label=0 	 token=ĠMuseum
i=18 	 word_id=16 	 label=4 	 token=Ġin
i=19 	 word_id=17 	 label=0 	 token=ĠDear

i=20 	 word_id=17 	 label=0 	 token=born
i=21 	 word_id=18 	 label=4 	 token=Ġ,
i=22 	 word_id=19 	 label=0 	 token=ĠMichigan


IndexError: list index out of range

In [35]:
previous_word_id = False
previous_label = False
label_index = 0
match_labels = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    if word_id==previous_word_id:
        # add previous_label
        match_labels.append(int(previous_label)) # int(False) = 0 evaluates False to 0
    else:
        # incremente label_index
        label_index += 1
        # get label via label_index
        label = labels[label_index]
        # add label to match_labels
        match_labels.append(label)
    # update previous_word_id
    previous_word_id = word_id
    # update previous_label
    previous_label = label
    # logs
    print(f"i={i} \t word_id={word_id} \t label={label} \t token={tokens[i+1]}")

full_match_labels = [-100] + match_labels + [-100]
full_match_labels

i=0 	 word_id=0 	 label=0 	 token=ĠKnown
i=1 	 word_id=1 	 label=0 	 token=Ġlocally
i=2 	 word_id=2 	 label=0 	 token=Ġas
i=3 	 word_id=3 	 label=0 	 token=Ġ``
i=4 	 word_id=4 	 label=8 	 token=ĠFair
i=5 	 word_id=4 	 label=8 	 token=bottom
i=6 	 word_id=5 	 label=8 	 token=ĠBob
i=7 	 word_id=5 	 label=8 	 token=s
i=8 	 word_id=6 	 label=0 	 token=Ġ``
i=9 	 word_id=7 	 label=0 	 token=Ġit
i=10 	 word_id=8 	 label=0 	 token=Ġis
i=11 	 word_id=9 	 label=0 	 token=Ġnow
i=12 	 word_id=10 	 label=0 	 token=Ġpreserved
i=13 	 word_id=11 	 label=0 	 token=Ġat
i=14 	 word_id=12 	 label=0 	 token=Ġthe
i=15 	 word_id=13 	 label=2 	 token=ĠHenry
i=16 	 word_id=14 	 label=2 	 token=ĠFord
i=17 	 word_id=15 	 label=2 	 token=ĠMuseum
i=18 	 word_id=16 	 label=0 	 token=Ġin
i=19 	 word_id=17 	 label=4 	 token=ĠDear
i=20 	 word_id=17 	 label=4 	 token=born
i=21 	 word_id=18 	 label=0 	 token=Ġ,
i=22 	 word_id=19 	 label=4 	 token=ĠMichigan
i=23 	 word_id=20 	 label=0 	 token=Ġ.


[-100,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100]

In [37]:
len(full_match_labels), len(tokens)

(26, 26)

In [39]:
for i in range(len(tokens)):
    token = tokens[i]
    label = full_match_labels[i]
    print(f"label {label} \ttoken {token}")

label -100 	token <s>
label 0 	token ĠKnown
label 0 	token Ġlocally
label 0 	token Ġas
label 0 	token Ġ``
label 8 	token ĠFair
label 8 	token bottom
label 8 	token ĠBob
label 8 	token s
label 0 	token Ġ``
label 0 	token Ġit
label 0 	token Ġis
label 0 	token Ġnow
label 0 	token Ġpreserved
label 0 	token Ġat
label 0 	token Ġthe
label 2 	token ĠHenry
label 2 	token ĠFord
label 2 	token ĠMuseum
label 0 	token Ġin
label 4 	token ĠDear
label 4 	token born
label 0 	token Ġ,
label 4 	token ĠMichigan
label 0 	token Ġ.
label -100 	token </s>


In [22]:
previous_word_id = False
previous_label = False
label_index = 0
match_labels = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    if word_id==previous_word_id:
        # add previous_label
        match_labels.append(previous_label)
    else:
        # incremente label_index
        label_index += 1
        # get label via label_index
        label = labels[label_index]
        # add label to match_labels
        match_labels.append(label)
    # update previous_word_id
    previous_word_id = word_id
    # update previous_label
    previous_label = label
    # logs
    print(f"i={i} \t word_id={word_id} \t label={label} \t token={tokens[i+1]}")

full_match_labels = [-100] + match_labels + [-100]
full_match_labels

i=0 	 word_id=0 	 label=0 	 token=ĠKnown
i=1 	 word_id=1 	 label=0 	 token=Ġlocally
i=2 	 word_id=2 	 label=0 	 token=Ġas
i=3 	 word_id=3 	 label=0 	 token=Ġ``
i=4 	 word_id=4 	 label=8 	 token=ĠFair
i=5 	 word_id=4 	 label=8 	 token=bottom
i=6 	 word_id=5 	 label=8 	 token=ĠBob
i=7 	 word_id=5 	 label=8 	 token=s
i=8 	 word_id=6 	 label=0 	 token=Ġ``
i=9 	 word_id=7 	 label=0 	 token=Ġit
i=10 	 word_id=8 	 label=0 	 token=Ġis
i=11 	 word_id=9 	 label=0 	 token=Ġnow
i=12 	 word_id=10 	 label=0 	 token=Ġpreserved
i=13 	 word_id=11 	 label=0 	 token=Ġat
i=14 	 word_id=12 	 label=0 	 token=Ġthe
i=15 	 word_id=13 	 label=2 	 token=ĠHenry
i=16 	 word_id=14 	 label=2 	 token=ĠFord
i=17 	 word_id=15 	 label=2 	 token=ĠMuseum
i=18 	 word_id=16 	 label=0 	 token=Ġin
i=19 	 word_id=17 	 label=4 	 token=ĠDear
i=20 	 word_id=17 	 label=4 	 token=born
i=21 	 word_id=18 	 label=0 	 token=Ġ,
i=22 	 word_id=19 	 label=4 	 token=ĠMichigan
i=23 	 word_id=20 	 label=0 	 token=Ġ.


[-100,
 False,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100]

In [38]:
labels[0]

0

In [46]:
type(0), type(False)==bool

(int, True)

In [42]:
0==False

True

In [39]:
previous_word_id = False
previous_label = False
label_index = 0
match_labels = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    if word_id==previous_word_id:
        # add previous_label
        match_labels.append(previous_label)
    else:
        # incremente label_index
        label_index += 1
        # get label via label_index
        label = labels[label_index]
        # add label to match_labels
        match_labels.append(label)
    # update previous_word_id
    previous_word_id = word_id
    # update previous_label
    previous_label = label
    # logs
    print(f"i={i} \t word_id={word_id} \t label={label} \t token={tokens[i+1]}")

full_match_labels = [-100] + match_labels + [-100]
full_match_labels

i=0 	 word_id=0 	 label=0 	 token=ĠKnown
i=1 	 word_id=1 	 label=0 	 token=Ġlocally
i=2 	 word_id=2 	 label=0 	 token=Ġas
i=3 	 word_id=3 	 label=0 	 token=Ġ``
i=4 	 word_id=4 	 label=8 	 token=ĠFair
i=5 	 word_id=4 	 label=8 	 token=bottom
i=6 	 word_id=5 	 label=8 	 token=ĠBob
i=7 	 word_id=5 	 label=8 	 token=s
i=8 	 word_id=6 	 label=0 	 token=Ġ``
i=9 	 word_id=7 	 label=0 	 token=Ġit
i=10 	 word_id=8 	 label=0 	 token=Ġis
i=11 	 word_id=9 	 label=0 	 token=Ġnow
i=12 	 word_id=10 	 label=0 	 token=Ġpreserved
i=13 	 word_id=11 	 label=0 	 token=Ġat
i=14 	 word_id=12 	 label=0 	 token=Ġthe
i=15 	 word_id=13 	 label=2 	 token=ĠHenry
i=16 	 word_id=14 	 label=2 	 token=ĠFord
i=17 	 word_id=15 	 label=2 	 token=ĠMuseum
i=18 	 word_id=16 	 label=0 	 token=Ġin
i=19 	 word_id=17 	 label=4 	 token=ĠDear
i=20 	 word_id=17 	 label=4 	 token=born
i=21 	 word_id=18 	 label=0 	 token=Ġ,
i=22 	 word_id=19 	 label=4 	 token=ĠMichigan
i=23 	 word_id=20 	 label=0 	 token=Ġ.


[-100,
 None,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100]

In [32]:
len(full_match_labels)

26

In [47]:
tokens

['<s>',
 'ĠKnown',
 'Ġlocally',
 'Ġas',
 'Ġ``',
 'ĠFair',
 'bottom',
 'ĠBob',
 's',
 'Ġ``',
 'Ġit',
 'Ġis',
 'Ġnow',
 'Ġpreserved',
 'Ġat',
 'Ġthe',
 'ĠHenry',
 'ĠFord',
 'ĠMuseum',
 'Ġin',
 'ĠDear',
 'born',
 'Ġ,',
 'ĠMichigan',
 'Ġ.',
 '</s>']

In [45]:
label_index = 0
previous_word_id = False
matched_labels_partial = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    label = labels_cp[label_index]
    new_word = word_id!=previous_word_id
    if new_word:
        label_index += 1
    matched_labels_partial.append(label)
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tnew_word={new_word} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

len(matched_labels_partial)

i=0 	label_index=0 	label=0 	new_word=False 	word_id=0 	token=<s>
i=1 	label_index=1 	label=0 	new_word=True 	word_id=1 	token=ĠKnown
i=2 	label_index=2 	label=0 	new_word=True 	word_id=2 	token=Ġlocally
i=3 	label_index=3 	label=0 	new_word=True 	word_id=3 	token=Ġas
i=4 	label_index=4 	label=0 	new_word=True 	word_id=4 	token=Ġ``
i=5 	label_index=4 	label=8 	new_word=False 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	new_word=True 	word_id=5 	token=bottom
i=7 	label_index=5 	label=8 	new_word=False 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=8 	new_word=True 	word_id=6 	token=s
i=9 	label_index=7 	label=0 	new_word=True 	word_id=7 	token=Ġ``
i=10 	label_index=8 	label=0 	new_word=True 	word_id=8 	token=Ġit
i=11 	label_index=9 	label=0 	new_word=True 	word_id=9 	token=Ġis
i=12 	label_index=10 	label=0 	new_word=True 	word_id=10 	token=Ġnow
i=13 	label_index=11 	label=0 	new_word=True 	word_id=11 	token=Ġpreserved
i=14 	label_index=12 	label=0 	new_word=True 	word_id=12 	t

24

In [42]:
label_index = 0
previous_word_id = False
matched_labels = []
for i in range(len(simple_word_ids)):
    word_id = simple_word_ids[i]
    label = labels_cp[label_index]
    new_word = word_id!=previous_word_id
    if new_word:
        label_index += 1
    matched_labels.append(label)
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tnew_word={new_word} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

len(matched_labels)

i=0 	label_index=0 	label=0 	new_word=False 	word_id=0 	token=<s>
i=1 	label_index=1 	label=0 	new_word=True 	word_id=1 	token=ĠKnown
i=2 	label_index=2 	label=0 	new_word=True 	word_id=2 	token=Ġlocally
i=3 	label_index=3 	label=0 	new_word=True 	word_id=3 	token=Ġas
i=4 	label_index=4 	label=0 	new_word=True 	word_id=4 	token=Ġ``
i=5 	label_index=4 	label=8 	new_word=False 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	new_word=True 	word_id=5 	token=bottom
i=7 	label_index=5 	label=8 	new_word=False 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=8 	new_word=True 	word_id=6 	token=s
i=9 	label_index=7 	label=0 	new_word=True 	word_id=7 	token=Ġ``
i=10 	label_index=8 	label=0 	new_word=True 	word_id=8 	token=Ġit
i=11 	label_index=9 	label=0 	new_word=True 	word_id=9 	token=Ġis
i=12 	label_index=10 	label=0 	new_word=True 	word_id=10 	token=Ġnow
i=13 	label_index=11 	label=0 	new_word=True 	word_id=11 	token=Ġpreserved
i=14 	label_index=12 	label=0 	new_word=True 	word_id=12 	t

24

In [43]:
matched_labels_all = [-100] + matched_labels + [-100]
matched_labels_all

[-100,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 -100]

In [44]:
len(matched_labels_all)

26

In [27]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids)-1):
    word_id = word_ids[i]
    #word_id = word_id if word_id!=None else -100
    label = labels_cp[label_index]
    new_word = word_id!=previous_word_id
    if new_word:
        label_index += 1
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tnew_word={new_word} \tword_id={word_id}\tprevious_word_id={previous_word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	new_word=True 	word_id=None	previous_word_id=False 	token=<s>
i=1 	label_index=1 	label=0 	new_word=True 	word_id=0	previous_word_id=None 	token=ĠKnown
i=2 	label_index=2 	label=0 	new_word=True 	word_id=1	previous_word_id=0 	token=Ġlocally
i=3 	label_index=3 	label=0 	new_word=True 	word_id=2	previous_word_id=1 	token=Ġas
i=4 	label_index=4 	label=0 	new_word=True 	word_id=3	previous_word_id=2 	token=Ġ``
i=5 	label_index=5 	label=8 	new_word=True 	word_id=4	previous_word_id=3 	token=ĠFair
i=6 	label_index=5 	label=8 	new_word=False 	word_id=4	previous_word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	new_word=True 	word_id=5	previous_word_id=4 	token=ĠBob
i=8 	label_index=6 	label=0 	new_word=False 	word_id=5	previous_word_id=5 	token=s
i=9 	label_index=7 	label=0 	new_word=True 	word_id=6	previous_word_id=5 	token=Ġ``
i=10 	label_index=8 	label=0 	new_word=True 	word_id=7	previous_word_id=6 	token=Ġit
i=11 	label_index=9 	label=0 	new_word=True 	word_i

In [26]:
1 in (2, 3)

False

In [22]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    label = labels_cp[label_index]
    boole = word_id!=previous_word_id or tokens[i]=="</s>"
    if boole:
        label_index += 1
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tboole={boole} \tword_id={word_id}\tprevious_word_id={previous_word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	boole=True 	word_id=None	previous_word_id=False 	token=<s>
i=1 	label_index=1 	label=0 	boole=True 	word_id=0	previous_word_id=None 	token=ĠKnown
i=2 	label_index=2 	label=0 	boole=True 	word_id=1	previous_word_id=0 	token=Ġlocally
i=3 	label_index=3 	label=0 	boole=True 	word_id=2	previous_word_id=1 	token=Ġas
i=4 	label_index=4 	label=0 	boole=True 	word_id=3	previous_word_id=2 	token=Ġ``
i=5 	label_index=5 	label=8 	boole=True 	word_id=4	previous_word_id=3 	token=ĠFair
i=6 	label_index=5 	label=8 	boole=False 	word_id=4	previous_word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	boole=True 	word_id=5	previous_word_id=4 	token=ĠBob
i=8 	label_index=6 	label=0 	boole=False 	word_id=5	previous_word_id=5 	token=s
i=9 	label_index=7 	label=0 	boole=True 	word_id=6	previous_word_id=5 	token=Ġ``
i=10 	label_index=8 	label=0 	boole=True 	word_id=7	previous_word_id=6 	token=Ġit
i=11 	label_index=9 	label=0 	boole=True 	word_id=8	previous_word_id=7 	token=Ġis
i=

IndexError: list index out of range

In [16]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    label = labels_cp[label_index]
    boole = word_id!=previous_word_id or tokens[i]=="</s>"
    if boole:
        label_index += 1
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tboole={boole} \tword_id={word_id}\tprevious_word_id={previous_word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	boole=True 	word_id=None	previous_word_id=False 	token=<s>
i=1 	label_index=1 	label=0 	boole=True 	word_id=0	previous_word_id=None 	token=ĠKnown
i=2 	label_index=2 	label=0 	boole=True 	word_id=1	previous_word_id=0 	token=Ġlocally
i=3 	label_index=3 	label=0 	boole=True 	word_id=2	previous_word_id=1 	token=Ġas
i=4 	label_index=4 	label=0 	boole=True 	word_id=3	previous_word_id=2 	token=Ġ``
i=5 	label_index=5 	label=8 	boole=True 	word_id=4	previous_word_id=3 	token=ĠFair
i=6 	label_index=5 	label=8 	boole=False 	word_id=4	previous_word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	boole=True 	word_id=5	previous_word_id=4 	token=ĠBob
i=8 	label_index=6 	label=0 	boole=False 	word_id=5	previous_word_id=5 	token=s
i=9 	label_index=7 	label=0 	boole=True 	word_id=6	previous_word_id=5 	token=Ġ``
i=10 	label_index=8 	label=0 	boole=True 	word_id=7	previous_word_id=6 	token=Ġit
i=11 	label_index=9 	label=0 	boole=True 	word_id=8	previous_word_id=7 	token=Ġis
i=

IndexError: list index out of range

In [17]:
# use this! keep it unchanged and edit a copy instead! handle </s> via dedicated code!

# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids[:-1])):
    word_id = word_ids[i]
    if word_id!=previous_word_id:# or tokens[i]=="</s>":
        label_index += 1
    label = labels_cp[label_index]
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=8 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=0 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=2 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=0 	word_id=15 	token=ĠMu

IndexError: list index out of range

In [16]:
previous_word_id = False
label_index = 0
for i in range(len(word_ids)):
    word_id = word_ids[i]
    if word_id==previous_word_id:
        policy = "reuse label_index"
        label = labels[label_index]
    else:
        policy = "update label_index"
        label_index += 1
        label = labels[label_index]
    if word_id==None:
        label=-100
    print(f"i: {i}\tword_id: {word_id}\tpolicy: {policy}\tlabel: {label}\ttoken: {tokens[i]}")
    previous_word_id = word_id

i: 0	word_id: None	policy: update label_index	label: -100	token: <s>
i: 1	word_id: 0	policy: update label_index	label: 0	token: ĠKnown
i: 2	word_id: 1	policy: update label_index	label: 0	token: Ġlocally
i: 3	word_id: 2	policy: update label_index	label: 8	token: Ġas
i: 4	word_id: 3	policy: update label_index	label: 8	token: Ġ``
i: 5	word_id: 4	policy: update label_index	label: 0	token: ĠFair
i: 6	word_id: 4	policy: reuse label_index	label: 0	token: bottom
i: 7	word_id: 5	policy: update label_index	label: 0	token: ĠBob
i: 8	word_id: 5	policy: reuse label_index	label: 0	token: s
i: 9	word_id: 6	policy: update label_index	label: 0	token: Ġ``
i: 10	word_id: 7	policy: update label_index	label: 0	token: Ġit
i: 11	word_id: 8	policy: update label_index	label: 0	token: Ġis
i: 12	word_id: 9	policy: update label_index	label: 0	token: Ġnow
i: 13	word_id: 10	policy: update label_index	label: 0	token: Ġpreserved
i: 14	word_id: 11	policy: update label_index	label: 2	token: Ġat
i: 15	word_id: 12	policy

IndexError: list index out of range

In [17]:
labels, len(labels)

([0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0], 21)

In [18]:
word_ids[25]==None

True

In [19]:
import copy
mylist = [0, 1]
mylist2 = copy.copy(mylist)
mylist2.append(2)
mylist, mylist2

([0, 1], [0, 1, 2])

In [81]:
labels_cp = copy.copy(labels)
print(len(labels_cp), len(labels))
labels_cp.append(0)
print(len(labels_cp), len(labels))

21 21
22 21


In [20]:
# use this! keep it unchanged and edit a copy instead! handle </s> via dedicated code!

# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids[:-1])):
    word_id = word_ids[i]
    if word_id!=previous_word_id:# or tokens[i]=="</s>":
        label_index += 1
    label = labels_cp[label_index]
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=8 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=0 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=2 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=0 	word_id=15 	token=ĠMu

IndexError: list index out of range

In [21]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    label = labels_cp[label_index]
    boole = word_id!=previous_word_id or tokens[i]=="</s>"
    if boole:
        label_index += 1
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tboole={boole} \tword_id={word_id}\tprevious_word_id={previous_word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id

i=0 	label_index=0 	label=0 	boole=True 	word_id=None	previous_word_id=False 	token=<s>
i=1 	label_index=1 	label=0 	boole=True 	word_id=0	previous_word_id=None 	token=ĠKnown
i=2 	label_index=2 	label=0 	boole=True 	word_id=1	previous_word_id=0 	token=Ġlocally
i=3 	label_index=3 	label=0 	boole=True 	word_id=2	previous_word_id=1 	token=Ġas
i=4 	label_index=4 	label=0 	boole=True 	word_id=3	previous_word_id=2 	token=Ġ``
i=5 	label_index=5 	label=8 	boole=True 	word_id=4	previous_word_id=3 	token=ĠFair
i=6 	label_index=5 	label=8 	boole=False 	word_id=4	previous_word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	boole=True 	word_id=5	previous_word_id=4 	token=ĠBob
i=8 	label_index=6 	label=0 	boole=False 	word_id=5	previous_word_id=5 	token=s
i=9 	label_index=7 	label=0 	boole=True 	word_id=6	previous_word_id=5 	token=Ġ``
i=10 	label_index=8 	label=0 	boole=True 	word_id=7	previous_word_id=6 	token=Ġit
i=11 	label_index=9 	label=0 	boole=True 	word_id=8	previous_word_id=7 	token=Ġis
i=

IndexError: list index out of range

In [51]:
len(words), len(labels), len(tokens), len(word_ids), word_ids[0]==None

(21, 21, 26, 26, True)

In [61]:
previous_word_id = False
label_index = 0
for i in range(len(word_ids)):
    word_id = word_ids[i]
    repeated_word_id = (word_id==previous_word_id)
    if repeated_word_id:
        label = labels[label_index]
    else:
        label_index+=1
        label = labels[label_index]
        
    print(f"i={i}\tword_id={word_id}\trepeated_word_id: {repeated_word_id}")
    previous_word_id = word_id

i=0	word_id=None	repeated_word_id: False
i=1	word_id=0	repeated_word_id: False
i=2	word_id=1	repeated_word_id: False
i=3	word_id=2	repeated_word_id: False
i=4	word_id=3	repeated_word_id: False
i=5	word_id=4	repeated_word_id: False
i=6	word_id=4	repeated_word_id: True
i=7	word_id=5	repeated_word_id: False
i=8	word_id=5	repeated_word_id: True
i=9	word_id=6	repeated_word_id: False
i=10	word_id=7	repeated_word_id: False
i=11	word_id=8	repeated_word_id: False
i=12	word_id=9	repeated_word_id: False
i=13	word_id=10	repeated_word_id: False
i=14	word_id=11	repeated_word_id: False
i=15	word_id=12	repeated_word_id: False
i=16	word_id=13	repeated_word_id: False
i=17	word_id=14	repeated_word_id: False
i=18	word_id=15	repeated_word_id: False
i=19	word_id=16	repeated_word_id: False
i=20	word_id=17	repeated_word_id: False
i=21	word_id=17	repeated_word_id: True
i=22	word_id=18	repeated_word_id: False


IndexError: list index out of range

In [60]:
previous_word_id = False
label_index = 0
for i in range(len(word_ids)):
    word_id = word_ids[i]
    repeated_word_id = (word_id==previous_word_id)
    if repeated_word_id:
        label = labels[label_index]
    else:
        label_index+=1
        
    print(f"i={i}\tword_id={word_id}\trepeated_word_id: {repeated_word_id}")
    previous_word_id = word_id

i=0	word_id=None	repeated_word_id: False
i=1	word_id=0	repeated_word_id: False
i=2	word_id=1	repeated_word_id: False
i=3	word_id=2	repeated_word_id: False
i=4	word_id=3	repeated_word_id: False
i=5	word_id=4	repeated_word_id: False
i=6	word_id=4	repeated_word_id: True
i=7	word_id=5	repeated_word_id: False
i=8	word_id=5	repeated_word_id: True
i=9	word_id=6	repeated_word_id: False
i=10	word_id=7	repeated_word_id: False
i=11	word_id=8	repeated_word_id: False
i=12	word_id=9	repeated_word_id: False
i=13	word_id=10	repeated_word_id: False
i=14	word_id=11	repeated_word_id: False
i=15	word_id=12	repeated_word_id: False
i=16	word_id=13	repeated_word_id: False
i=17	word_id=14	repeated_word_id: False
i=18	word_id=15	repeated_word_id: False
i=19	word_id=16	repeated_word_id: False
i=20	word_id=17	repeated_word_id: False
i=21	word_id=17	repeated_word_id: True
i=22	word_id=18	repeated_word_id: False
i=23	word_id=19	repeated_word_id: False
i=24	word_id=20	repeated_word_id: False
i=25	word_id=None	repea

In [40]:
i = 17
words[i], labels[i]

('Dearborn', 4)

In [42]:
len(word_ids), len(tokens)

(26, 26)

In [45]:
i = 21
word_ids[i], tokens[i]

(17, 'born')

In [32]:
print(len(words))
words

21


['Known',
 'locally',
 'as',
 '``',
 'Fairbottom',
 'Bobs',
 '``',
 'it',
 'is',
 'now',
 'preserved',
 'at',
 'the',
 'Henry',
 'Ford',
 'Museum',
 'in',
 'Dearborn',
 ',',
 'Michigan',
 '.']

In [33]:
print(len(labels))
labels

21


[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [None]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
####labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    ####if word_id==None:
    ####    labels_matched.append(-100)
    # handle all other labels
    label = labels_cp[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    ####if word_id==previous_word_id and tokens[i]!="</s>":
    ####    labels_matched.append(previous_label)
    ## handle new labels
    #else:
    if word_id!=previous_word_id or tokens[i]=="</s>":
        #labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
#labels_matched = labels_matched#[:-1]
#assert len(labels_matched)==len(tokens)
#labels_matched = labels_matched[:-1]
#labels_matched, len(labels_matched)

In [23]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    if word_id==None:
        labels_matched.append(-100)
    # handle all other labels
    label = labels_cp[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    if word_id==previous_word_id and tokens[i]!="</s>":
        labels_matched.append(previous_label)
    ## handle new labels
    else:
        labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
#labels_matched = labels_matched#[:-1]
#assert len(labels_matched)==len(tokens)
labels_matched = labels_matched[:-1]
labels_matched, len(labels_matched)

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

([-100,
  0,
  0,
  0,
  0,
  0,
  8,
  8,
  8,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  2,
  2,
  0,
  4,
  4,
  0,
  4,
  0,
  -100],
 27)

In [26]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
####labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    ####if word_id==None:
    ####    labels_matched.append(-100)
    # handle all other labels
    label = labels_cp[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    ####if word_id==previous_word_id and tokens[i]!="</s>":
    ####    labels_matched.append(previous_label)
    ## handle new labels
    #else:
    if word_id!=previous_word_id or tokens[i]=="</s>":
        #labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
#labels_matched = labels_matched#[:-1]
#assert len(labels_matched)==len(tokens)
#labels_matched = labels_matched[:-1]
#labels_matched, len(labels_matched)

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

In [22]:
len(tokens)

26

In [17]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
labels.append(0)
labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    if word_id==None:
        labels_matched.append(-100)
    # handle all other labels
    label = labels[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    if word_id==previous_word_id and tokens[i]!="</s>":
        labels_matched.append(previous_label)
    ## handle new labels
    else:
        labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
#labels_matched = labels_matched#[:-1]
#assert len(labels_matched)==len(tokens)
labels_matched

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

[-100,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100,
 0]

In [18]:
len(labels_matched), len(tokens)

(28, 26)

In [19]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
labels.append(0)
labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    if word_id==None:
        labels_matched.append(-100)
    # handle all other labels
    label = labels[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    if word_id==previous_word_id and tokens[i]!="</s>":
        labels_matched.append(previous_label)
    ## handle new labels
    else:
        labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
labels_matched = labels_matched[:-1]
labels_matched

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

[-100,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100]

In [20]:
len(labels_matched)

27

In [29]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
labels.append(0)
labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    if word_id==None:
        labels_matched.append(-100)
    # handle all other labels
    label = labels[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    if word_id==previous_word_id and tokens[i]!="</s>":
        labels_matched.append(previous_label)
    ## handle new labels
    else:
        labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
labels_matched

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

[-100,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100,
 0]

In [30]:
len(labels_matched)

28

In [19]:
# match labels to word_ids: if word_id repeats, attach previous label once more to labels_matched
labels.append(0)
labels_matched = []
label_index = -1
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    ## handle "None" labels
    if word_id==None:
        labels_matched.append(-100)
    # handle all other labels
    label = labels[label_index]
    ## handle repeating word_ids (word split into multiple tokens)
    if word_id==previous_word_id and tokens[i]!="</s>":
        labels_matched.append(previous_label)
    ## handle new labels
    else:
        labels_matched.append(label)
        label_index += 1
    # output and updates
    print(f"i={i} \tlabel_index={label_index} \tlabel={label} \tword_id={word_id} \ttoken={tokens[i]}")
    previous_word_id = word_id
    previous_label = label
    
#
labels_matched

i=0 	label_index=0 	label=0 	word_id=None 	token=<s>
i=1 	label_index=1 	label=0 	word_id=0 	token=ĠKnown
i=2 	label_index=2 	label=0 	word_id=1 	token=Ġlocally
i=3 	label_index=3 	label=0 	word_id=2 	token=Ġas
i=4 	label_index=4 	label=0 	word_id=3 	token=Ġ``
i=5 	label_index=5 	label=8 	word_id=4 	token=ĠFair
i=6 	label_index=5 	label=8 	word_id=4 	token=bottom
i=7 	label_index=6 	label=8 	word_id=5 	token=ĠBob
i=8 	label_index=6 	label=0 	word_id=5 	token=s
i=9 	label_index=7 	label=0 	word_id=6 	token=Ġ``
i=10 	label_index=8 	label=0 	word_id=7 	token=Ġit
i=11 	label_index=9 	label=0 	word_id=8 	token=Ġis
i=12 	label_index=10 	label=0 	word_id=9 	token=Ġnow
i=13 	label_index=11 	label=0 	word_id=10 	token=Ġpreserved
i=14 	label_index=12 	label=0 	word_id=11 	token=Ġat
i=15 	label_index=13 	label=0 	word_id=12 	token=Ġthe
i=16 	label_index=14 	label=2 	word_id=13 	token=ĠHenry
i=17 	label_index=15 	label=2 	word_id=14 	token=ĠFord
i=18 	label_index=16 	label=2 	word_id=15 	token=ĠMu

[-100,
 0,
 0,
 0,
 0,
 0,
 8,
 8,
 8,
 8,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 4,
 4,
 0,
 4,
 0,
 -100,
 0]

In [20]:
#i = 22
#inputs = tokenizer(fewnerd_all[ix]["tokens"], is_split_into_words=True)
#tokens = inputs.tokens()
#word_ids = inputs.word_ids()
#assert len(tokens)==len(word_ids)
print(f"{len(tokens)} tokens and word_ids")
for j in range(len(tokens)):
    print(f"token: {tokens[j]}\tword_id: {word_ids[j]}")

26 tokens and word_ids
token: <s>	word_id: None
token: ĠKnown	word_id: 0
token: Ġlocally	word_id: 1
token: Ġas	word_id: 2
token: Ġ``	word_id: 3
token: ĠFair	word_id: 4
token: bottom	word_id: 4
token: ĠBob	word_id: 5
token: s	word_id: 5
token: Ġ``	word_id: 6
token: Ġit	word_id: 7
token: Ġis	word_id: 8
token: Ġnow	word_id: 9
token: Ġpreserved	word_id: 10
token: Ġat	word_id: 11
token: Ġthe	word_id: 12
token: ĠHenry	word_id: 13
token: ĠFord	word_id: 14
token: ĠMuseum	word_id: 15
token: Ġin	word_id: 16
token: ĠDear	word_id: 17
token: born	word_id: 17
token: Ġ,	word_id: 18
token: ĠMichigan	word_id: 19
token: Ġ.	word_id: 20
token: </s>	word_id: None


In [21]:
len(labels)
match_labels = [-100] + labels + [-100]
match_labels, len(match_labels)

([-100,
  0,
  0,
  0,
  0,
  8,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  2,
  2,
  2,
  0,
  4,
  0,
  4,
  0,
  0,
  0,
  -100],
 25)

In [22]:
word_ids = inputs.word_ids()
word_ids, len(word_ids)

([None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None],
 26)

In [23]:
previous_word_id = False
for i in range(len(word_ids)):
    word_id = word_ids[i]
    is_previous_word_id = (word_id==previous_word_id)
    print(f"i={i}\tword_id={word_ids[i]}\tis_previous_word_id={is_previous_word_id}")
    # update for next iteration
    previous_word_id = word_id

i=0	word_id=None	is_previous_word_id=False
i=1	word_id=0	is_previous_word_id=False
i=2	word_id=1	is_previous_word_id=False
i=3	word_id=2	is_previous_word_id=False
i=4	word_id=3	is_previous_word_id=False
i=5	word_id=4	is_previous_word_id=False
i=6	word_id=4	is_previous_word_id=True
i=7	word_id=5	is_previous_word_id=False
i=8	word_id=5	is_previous_word_id=True
i=9	word_id=6	is_previous_word_id=False
i=10	word_id=7	is_previous_word_id=False
i=11	word_id=8	is_previous_word_id=False
i=12	word_id=9	is_previous_word_id=False
i=13	word_id=10	is_previous_word_id=False
i=14	word_id=11	is_previous_word_id=False
i=15	word_id=12	is_previous_word_id=False
i=16	word_id=13	is_previous_word_id=False
i=17	word_id=14	is_previous_word_id=False
i=18	word_id=15	is_previous_word_id=False
i=19	word_id=16	is_previous_word_id=False
i=20	word_id=17	is_previous_word_id=False
i=21	word_id=17	is_previous_word_id=True
i=22	word_id=18	is_previous_word_id=False
i=23	word_id=19	is_previous_word_id=False
i=24	word_id=20

In [24]:
# ner_tags = []
# word_id==None => idx_ner_tag = -100
# word_id==previous_word_id => idx_ner_tag = previous_ner_tag
# append idx_ner_tag to list "ner_tags"
len(tokens)==len(word_ids)

True

In [27]:
len(idx_ner_tags), len(tokens)

NameError: name 'ner_tags' is not defined

In [26]:
len(idx_ner_tags), idx_ner_tags

NameError: name 'idx_ner_tags' is not defined

In [None]:
idx_tokens, len(idx_tokens)

In [None]:
# use list of ner_tags (idx_ner_tags) and list of word ids (=word_ids)

In [None]:
tokens

In [None]:
len(idx_ner_tags), len(word_ids), len(tokens)

In [None]:
idx_ner_tags[0]

In [None]:
word_ids

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [None]:
labels, len(labels)

In [None]:
tokens, len(tokens)

In [None]:
word_ids, len(word_ids)

In [None]:
label_index = -1
for i in range(len(tokens)):
    new_labels = []
    word_id = word_ids[i]
    token = tokens[i]
    if word_id==None:
        #print("ignore -100")
        label = -100
        #label_index += 1
    elif word_id==previous_word_id:
        print("previous label")
        label = previous_label
    else:
        print("label index +=1")
        label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"i: {i}\tlabel: {label}\tword_id: {word_id}\tlabel_index: {label_index}\ttoken: {token}")

In [None]:
label_index = -1
for i in range(len(tokens)):
    new_labels = []
    word_id = word_ids[i]
    token = tokens[i]
    if word_id==None:
        print("ignore -100")
        label = -100
        #label_index += 1
    elif word_id==previous_word_id:
        print("previous label")
        label = previous_label
    else:
        print("next label")
        label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"i: {i}\tlabel: {label}\tword_id: {word_id}\tlabel_index: {label_index}\ttoken: {token}")
# to do next:
# - inspect instance
# - check whether token is <s> or </s> (=> ?)

---

In [None]:
len(labels)

In [None]:
label_index = 0
for i in range(len(tokens)):
    new_labels = []
    word_id = word_ids[i]
    token = tokens[i]
    if word_id==None:
        label = -100
        label_index += 1
    elif word_id==previous_word_id:
        label = previous_label
    else:
        #label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"i: {i}\tlabel: {labels[label_index]}\tword_id: {word_id}\ttoken: {token}")

In [None]:
len(labels)

In [None]:
tokens

In [None]:
label_index = 0
for i in range(29):
    new_labels = []
    word_id = word_ids[i]
    if word_id==None:
        #print("ignore -100")
        label = -100
        label_index += 1
    elif word_id==previous_word_id:
        #print("previous label")
        label = previous_label
    else:
        #print("next label")
        #print(word_id)
        label = labels[label_index]
        label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"label: {label}\tword_id: {word_id}\ti: {i}")

In [None]:
len(word_ids), word_ids[0], word_ids[-1]

In [None]:
# word_id==None => label = -100
# new word_id => first / next label
# same word_id => use previous label
label_index = 0
for i in range(len(word_ids)):
    word_id = word_ids[i]
    if word_id==None:
        label = -100
    else:
        label = labels[label_index]
    label_index += 1
    previous_word_id = word_id
    pervious_label = label
    print(f"i: {i}\tword_id: {word_id}\tlabel: {label}")
    if i==12:
        break

In [None]:
label_index

In [None]:
label_index = 0
for i in range(len(word_ids)):
    new_labels = []
    word_id = word_ids[i]
    if word_id==None:
        label = -100
        #label_index += 1
    elif word_id==previous_word_id:
        #print("previous label")
        label = previous_label
    else:
        #print("next label")
        #print(word_id)
        label = labels[label_index]
        label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"label: {label}\tlabel_index: {label_index}\tword_id: {word_id}\ti: {i}")

In [None]:
len(labels), len(word_ids), len(tokens)

In [None]:
word_ids, len(word_ids)

In [None]:
# 
tokens

In [None]:
word_ids

In [None]:
new_labels = []
current_word = None
for word_id in word_ids:
    print(word_id)
    if word_id != current_word:
        # Start of a new word!
        current_word = word_id
        label = -100 if word_id is None else labels[word_id]
        new_labels.append(label)
#
new_labels

In [None]:
previous_word_id = False
label_index = -2
for i in range(len(word_ids)):
    # word_id
    word_id = word_ids[i]
    # token
    token = tokens[i]
    # label
    if word_id==previous_word_id:
        # reuse previous_label
        label = previous_label
    else:
        # update label
        label_index += 1
        #label = labels[label_index]
    
    # print
    print(f"word_id={word_id}\ttoken={token}\tlabel_index={label_index}")
    # previous_word_id
    previous_word_id = word_id

In [None]:
token[0]

In [None]:
len(labels), labels[20]

In [None]:
# instantiate empty list of matched_labels
# declare previous_word_id = False
# loop over i in range(len(word_ids))
# get word_id_i (=word_ids[i]
# if word_id_i==None, append -100 to matched_labels
# 

In [None]:
len(tokens), len(word_ids), len(labels)

In [None]:
labels

In [None]:
word_ids[-1]==None

In [None]:
previous_word_id = False
label_index = -1
for i in range(len(tokens)):
    token = tokens[i]
    word_id = word_ids[i]
    if word_id==None:
        word_id = -100
    if word_id!=previous_word_id and word_id!=-100:
        #print(i)
        label_index += 1
    previous_word_id = word_id
    label = labels[label_index]
    print(f"item\t{i}\tword_id\t{word_id}\tlabel\t{label}\tlabel_index\t{label_index}")

In [None]:
previous_word_id = False
label_index = 0
for i in range(len(tokens)):
    token = tokens[i]
    word_id = word_ids[i]
    if word_id!=previous_word_id:
        label_index += 1
    #print(labels[label_index])
    print(f"{i}\t{word_id}")

In [None]:
previous_word_id = False
word_id_index = 0
for i in range(len(tokens)):
    token = tokens[i]
    word_id = word_ids[i]
    if word_id!=previous_word_id:
        word_id_index += 1
    print(word_ids[word_id_index])

In [None]:
matched_word_ids = []
previous_word_id = False
word_id_index = 0
for i in range(len(tokens)):
    token = tokens[i]
    word_id = word_ids[i]
    if word_id!=previous_word_id:
        word_id_index += 1
    #print(word_ids[word_id_index])
    matched_word_ids.append(word_ids[word_id_index])
len(matched_word_ids)

In [None]:
for i in range(len(matched_word_ids)):
    print(matched_word_ids[i])

In [None]:
labels

In [None]:
word_ids

In [None]:
tokens

In [None]:
label_index = 0
for i in range(29):
    new_labels = []
    word_id = word_ids[i]
    if word_id==None:
        #print("ignore -100")
        comment = "ignore -100"
        label = -100
        label_index += 1
    elif word_id==previous_word_id:
        #print("previous label")
        comment = "previous label"
        label = previous_label
    else:
        #print("next label")
        comment = "next label"
        #print(word_id)
        label = labels[label_index] # len(labels) = 21
        label_index += 1
    previous_word_id = word_id
    previous_label = label
    print(f"i: {i}\tword_id: {word_id}\tlabel: {label}\tlabel_index: {label_index}\tcomment: {comment}")

In [None]:
align_labels_with_tokens(labels, word_ids)

In [None]:
# loop over word_ids and ner_tags (but word_ids is longer than ner_tags because word_ids comes from tokens and ner_tags from words)
# inside the loop, get the word_id and check whether it is the same as the previous word id: if yes, set ner_tag to the previous ner_tag
previous_word_id = False
ner_tags = []
ner_tag_i = -1
for i, word_id in enumerate(word_ids):
    ner_tag = idx_ner_tags[ner_tag_i]
    if word_id==previous_word_id:
        ner_tag=previous_ner_tag
    else:
        ner_tag_i += 1
        previous_word_id = word_id
    print(f"i: {i}\t ner_tag_i: {ner_tag_i}")
    ner_tags.append(ner_tag)
    previous_ner_tag = ner_tag
#
len(ner_tags), len(word_ids)

In [None]:
current_ner_tag

In [None]:
# loop over word_ids and ner_tags (but word_ids is longer than ner_tags because word_ids comes from tokens and ner_tags from words)
# inside the loop, get the word_id and check whether it is the same as the previous word id: if yes, set ner_tag to the previous word_tag

# loop over tokens


# set current_word_id = -100
# set word_id_idx = 0
# get next word_id
# if word_id==current_word_id: append previous ner_tag once more
current_word_id = False
ner_tag_i = -1
for i in range(len(tokens)):
    # word_ids
    word_id = word_ids[i]
    if word_id==None:
        word_id = -100   
    if word_id==current_word_id:
        ner_tag = current_ner_tag
    else:
        ner_tag_i +=1
    current_word_id = word_id
    # ner_tags
    ner_tag = idx_ner_tags[ner_tag_i]
    current_ner_tag = ner_tag
    
    print(f"i = {i}\tword_id = {word_id}\tner_tag_i = {ner_tag_i}\ttoken = {tokens[i]}")

In [None]:
idx_ner_tags

In [None]:
word_id_list = []
current_word_id = False
for i, word_id in enumerate(word_ids):
    
    
    print(word_id)

In [None]:
token_labels = []
for i in range(len(tokens)):
    print(f"word_id:\t{word_ids[i]}\ttoken:\t{tokens[i]}")

In [None]:
labels

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [None]:
labels = fewnerd_all[1]["ner_tags"]
print(labels)
word_ids = inputs.word_ids()
print(align_labels_with_tokens(labels, word_ids))

In [None]:
i = 22
words = fewnerd_all[i]["tokens"]
labels = fewnerd_all[i]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)


In [1]:
%env WANDB_SILENT=True

env: WANDB_SILENT=True


In [None]:
tokenizer.tokenize("Pakistani")

In [None]:
labels = fewnerd_all.features["ner_tags"].feature.names
labels

In [None]:
id2label = {k: v for k, v in enumerate(labels)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

In [None]:
# tokenize an instance
# get tokens
# get word_ids
# adapt labels to word_ids
fewnerd_all

In [None]:
i = 22 # 17, 22
fewnerd_all[i]["words"], fewnerd_all[i]["ner_tags"]

In [None]:
fewnerd_all["ner_tags"][8]

In [None]:
for i in fewnerd_all.features["ner_tags"]:
    print(i)

In [None]:
coarse_types = ["Location", "Person", "ORG", "Building", "Art", "Product", "Event", "Misc"]
fine_types = [
    # Location
    "GPE", "Body of Water", "Island", "Mountain", "Park", "Road/Transit", "Other",
	# Person
	"Actor", "Aritst/Author", "Director", "Politician", "Scholar", "Soldier", "Other",
	# ORG
	"Company", "Education", "Government", "Media", "Politician/party", "Religion", "Sports League", "Sports Team", "Show ORG", "Other",
	# Building
    "Airport", "Hospital", "Hotel", "Library", "Restaurant", "Sports Facility", "Theater", "Other",
    # Art
    "Music", "Film", "Written Arg", "Broadcast", "Painting", "Other",
	# Product
	"Airplane", "Car", "Food", "Game", "Ship", "Software", "Train", "Weapon", "Other",
	# Event
	"Attack", "Election", "Natural Disaster", "Protest", "Sports Event", "Other",
	# Misc
    "Astronomy", "Award", "Biology", "Chemistry", "Currency", "Disease", "Educational Degree", "God", "Language", "Law", "Living Thing", "Medical"
]

In [None]:
id2label_coarse = {str(i): label for i, label in enumerate(coarse_types)}
label2id_coarse ={v: k for k, v in id2label_coarse.items()}
id2label_coarse, label2id_coarse

In [None]:
#id2label_fine = {str(i): label for i, label in enumerate(fine_types)}
#label2id_fine ={v: k for k, v in id2label_fine.items()}
#id2label_fine, label2id_fine

In [None]:
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

checkpoint = "FacebookAI/roberta-large"
lr = 1e-3
batch_size = 16
num_epochs = 10
# load model
# load tokenizer
# load dataset
# get dataset instance (text and ner_labels)
# align text with ner_labels
# get tokenized text
# align tokenized text with ner_labels

In [None]:
# load model
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
my_text = "Some fancy, non-squiggly text."
my_text_tokenized = tokenizer.tokenize(my_text)
my_text_tokenized

In [None]:
my_text_encoded = tokenizer.encode("some fancy, non-squiggly text")
my_text_encoded

In [None]:
my_text_decoded = tokenizer.decode(my_text_encoded)
my_text_decoded

In [None]:
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]])
fewnerd_all = fewnerd_all.rename_column("tokens", "text")
fewnerd_all.features

In [None]:
inputs = tokenizer("some fancy, non-squiggly text")#, is_split_into_words=True)
inputs.tokens()

In [None]:
print(len(inputs.word_ids())==len(inputs.tokens()))
inputs.word_ids(), inputs.tokens()

In [None]:
tokenizer.encode(inputs)

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

## Load dataset and metric
The [BioNLP2004](https://huggingface.co/datasets/tner/bionlp2004) dataset includes tokens and tags for biological structures like DNA, RNA and proteins. Load the dataset:

In [None]:
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]])
fewnerd_all = fewnerd_all.rename_column("tokens", "text")
fewnerd_all

In [None]:
i = 8
fewnerd_all["ner_tags"][i]

In [None]:
fewnerd_all["text"][i]

In [None]:
i = 7
len(fewnerd_all["ner_tags"][i])==len(fewnerd_all["text"][i])

Load the [**fewnerd**](https://arxiv.org/pdf/2105.07464v6.pdf) dataset and read the according [**publication**](https://aclanthology.org/2021.acl-long.248/)!

In [None]:
from datasets import DatasetDict
# 0.1
# 100 * 0.1 = 10 => 10 for test, 90 for train + valid; 90 * 0.1 = 9 for valid => 81 for train
# 0.15
# 100 * 0.15 = 15 => 15 for test, 85 for train + valid; 85 * 0.15 = 12.75 for valid => 72.75 for train
# 0.2
# 100 * 0.2 = 20 => 20 for test, 80 for train + valid; 80 * 0.2 = 16 for valid => 64 for train
dataset_cc = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]])
dev_split = dataset_cc.train_test_split(test_size=4)["test"]
trainvalid_test_splits = dataset_cc.train_test_split(test_size=0.15) # train 81% valid 9% test 10%
test_split = trainvalid_test_splits["test"]
trainvalid_split = trainvalid_test_splits["train"]
train_valid_split = trainvalid_split.train_test_split(test_size=0.15)
valid_split = train_valid_split["test"]
train_split = train_valid_split["train"]
dataset_fewnerd = DatasetDict({
    "train": train_split,
    "valid": valid_split,
    "test": test_split,
    "dev": dev_split
}).remove_columns(["id", "ner_tags"])
dataset_fewnerd

In [None]:
!ls 1_ner

In [None]:
!ls 1_ner

In [None]:
import json
json_file_path = "/path/to/example.json"
json_file_path = "1_ner/few_ner_labels.json"

with open(json_file_path, "r") as j:
     contents = json.loads(j.read())
contents

In [None]:
import json
json_file_path = "/path/to/example.json"
json_file_path = "1_ner/few_ner_labels.json"

with open(json_file_path, "r") as j:
     contents = json.loads(j.read())
contents

In [None]:
!ls 1_ner

In [None]:
import json import
with open("1_ner/few_ner_labels.json", "r") as file:
    labels = load(file)
labels

In [None]:
import json
json.loads("1_ner/few_ner_labels.json")

In [None]:
dataset_cc["fine_ner_tags"]

In [None]:
dataset_fewnerd["dev"]

In [None]:
# relevant keys: "tokens" (rename to "words"), "fine_ner_tags"; irrelevant keys: "id", "ner_tags"
dataset_fewnerd["dev"][0].keys() # keys: "id", "tokens", "ner_tags", "fine_ner_tags" 

In [None]:
len(dataset_fewnerd["dev"][0]["fine_ner_tags"]) # use fine_ner_tags for a challenge!

In [None]:
i = 94
dataset_fewnerd["train"][i]["fine_ner_tags"]

In [None]:
dataset_fewnerd["train"][i]["tokens"]

In [None]:
# 0.1
# 100 * 0.1 = 10 => 10 for test, 90 for train + valid; 90 * 0.1 = 9 for valid => 81 for train
# 0.15
# 100 * 0.15 = 15 => 15 for test, 85 for train + valid; 85 * 0.15 = 12.75 for valid => 72.75 for train
# 0.2
# 100 * 0.2 = 20 => 20 for test, 80 for train + valid; 80 * 0.2 = 16 for valid => 64 for train
bionlp = load_dataset("tner/bionlp2004")
# each dataset instance is a dictionary with keys "tokens" (or "words") and "tags".
# under these keys the dictionary has equally long lists of tokens and tags
bionlp

In [None]:
bionlp["train"][0]

In [None]:
bionlp["train"][0].keys()

The `tags` values are defined in the label ids [dictionary](https://huggingface.co/datasets/tner/bionlp2004#label-id). The letter that prefixes each label indicates the token position: `B` is for the first token of an entity, `I` is for a token inside the entity, and `0` is for a token that is not part of an entity.

```
{
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10,
}
```

Then load the [`seqeval`](https://huggingface.co/spaces/evaluate-metric/seqeval) framework which includes several metrics - precision, accuracy, F1, and recall - for evaluating sequence labeling tasks.

In [None]:
seqeval = evaluate.load("seqeval")

Now you can write an evaluation function to compute the metrics from the model predictions and labels, and return the precision, recall, $F_1$, and accuracy scores:

In [None]:
label_list = [
    "O",
    "B-DNA",
    "I-DNA",
    "B-protein",
    "I-protein",
    "B-cell_type",
    "I-cell_type",
    "B-cell_line",
    "I-cell_line",
    "B-RNA",
    "I-RNA",
]

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Preprocess dataset
Initialize a tokenizer and make sure you set `is_split_into_words=True` because the text sequence has already been split into words. However, this doesn't mean it is tokenized yet (even though it may look like it!), and you'll need to further tokenize the words into subwords.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

You’ll also need to write a function to:
1. Map each token to their respective word with the [`word_ids`](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/tokenizer#transformers.BatchEncoding.word_ids) method.
1. Ignore the special tokens by setting them to `-100`.
1.  Label the first token of a given entity.

In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

Use [`map`](https://huggingface.co/docs/datasets/v2.16.1/en/package_reference/main_classes#datasets.Dataset.map) to apply the `tokenize_and_align_labels` function to the dataset:

In [None]:
tokenized_bionlp = bionlp.map(tokenize_and_align_labels, batched=True)
tokenized_bionlp

Finally, create a data collator to pad the examples to the longest length in a batch:

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

## Train
Now you're ready to create a [PeftModel](https://huggingface.co/docs/peft/v0.8.2/en/package_reference/peft_model#peft.PeftModel). Start by loading the base `roberta-large` model, the number of expected labels, and the `id2label` and `label2id` dictionaries:

In [None]:
id2label = {
    0: "O",
    1: "B-DNA",
    2: "I-DNA",
    3: "B-protein",
    4: "I-protein",
    5: "B-cell_type",
    6: "I-cell_type",
    7: "B-cell_line",
    8: "I-cell_line",
    9: "B-RNA",
    10: "I-RNA",
}
label2id = {
    "O": 0,
    "B-DNA": 1,
    "I-DNA": 2,
    "B-protein": 3,
    "I-protein": 4,
    "B-cell_type": 5,
    "I-cell_type": 6,
    "B-cell_line": 7,
    "I-cell_line": 8,
    "B-RNA": 9,
    "I-RNA": 10,
}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=11,
    id2label=id2label,
    label2id=label2id
)

Define the [LoraConfig](https://huggingface.co/docs/peft/v0.8.2/en/package_reference/lora#peft.LoraConfig) with:
- `task_type`, token classification (`TaskType.TOKEN_CLS`)
- `r`, the dimension of the low-rank matrices
- `lora_alpha`, scaling factor for the weight matrices
- `lora_dropout`, dropout probability of the LoRA layers
- `bias`, set to `all` to train all bias parameters

> <font style="color:darkgreen">💡 The weight matrix is scaled by `lora_alpha/r`, and a higher `lora_alpha` value assigns more weight to the LoRA activations. For performance, we recommend setting `bias` to `None` first, and then `lora_only`, before trying `all`.</font>

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="all"
)

Pass the base model and `peft_config` to the [`get_peft_model()`](https://huggingface.co/docs/peft/v0.8.2/en/package_reference/peft_model#peft.get_peft_model) function to create a [PeftModel](https://huggingface.co/docs/peft/v0.8.2/en/package_reference/peft_model#peft.PeftModel). You can check out how much more efficient training the PeftModel is compared to fully training the base model by printing out the trainable parameters:

In [None]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

From the 🤗 Transformers library, create a [TrainingArguments](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/trainer#transformers.TrainingArguments) class and specify where you want to save the model to, the training hyperparameters, how to evaluate the model, and when to save the checkpoints:

In [None]:
training_args = TrainingArguments(
    output_dir="ner/logs/roberta-large-lora-token-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

Pass the model, `TrainingArguments`, datasets, tokenizer, data collator and evaluation function to the [Trainer](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/trainer#transformers.Trainer) class. The `Trainer` handles the training loop for you, and when you're ready, call [`train`](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/trainer#transformers.Trainer.train) to begin!

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_bionlp["train"],
    eval_dataset=tokenized_bionlp["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

## Share model
Once training is complete, you can store and share your model on the Hub if you'd like. Log in to your HuggingFace account and enter your token when prompted:

In [None]:
from huggingface_hub import notebook_login
notebook_login()

Upload the model to a specific model repository on the Hub with the [`push_to_hub`](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/model#transformers.PreTrainedModel.push_to_hub) method:

In [None]:
model.push_to_hub("mdroth/roberta-large-lora-token-classification")

##  Inference
To use your model for inference, load the configuration and model: $1+1=2$.

In [None]:
peft_model_id = "mdroth/roberta-large-lora-token-classification"
config = PeftConfig.from_pretrained(peft_model_id)
inference_model = AutoModelForTokenClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=11, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, peft_model_id)

Get some text to tokenize:

In [None]:
text = "The activation of IL-2 gene expression and NF-kappa B through CD28 requires reactive oxygen production by 5-lipoxygenase."
inputs = tokenizer(text, return_tensors="pt")
inputs

Pass the inputs to the model, and print out the model prediction for each token:

In [None]:
with torch.no_grad():
    logits = model(**inputs).logits
tokens = inputs.tokens()
predictions = torch.argmax(logits, dim=2)
for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))

<font style="font-weight:300">✔</font>