# Token classification with LoRA
Done:
- count flops via [flops-profiler](https://pypi.org/project/flops-profiler/) in accelerate loop

ToDo:
- backup (always – never finished)
- use LoRA model
- find batch size automatically
- add wandb sweep
- tidy up

In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, get_scheduler
from evaluate import load
from accelerate import Accelerator
from tqdm.auto import tqdm
from deepspeed.profiling.flops_profiler import FlopsProfiler
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]]).rename_column("tokens", "words")
fewnerd_all

[2024-02-22 15:57:05,248] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [2]:
x = 104 # => 188024 (104 works, 105 doesn't work [CUDA OOM])
fewnerd_all = fewnerd_all.filter(lambda example: len(example["words"])<=x)
fewnerd_all

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188024
})

In [3]:
idx = 22
fewnerd_all[idx]["words"], fewnerd_all[idx]["ner_tags"]

(['Known',
  'locally',
  'as',
  '``',
  'Fairbottom',
  'Bobs',
  '``',
  'it',
  'is',
  'now',
  'preserved',
  'at',
  'the',
  'Henry',
  'Ford',
  'Museum',
  'in',
  'Dearborn',
  ',',
  'Michigan',
  '.'],
 [0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0])

In [4]:
label_names = fewnerd_all.features["ner_tags"].feature.names
print(label_names)
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product']


({'0': 'O',
  '1': 'art',
  '2': 'building',
  '3': 'event',
  '4': 'location',
  '5': 'organization',
  '6': 'other',
  '7': 'person',
  '8': 'product'},
 {'O': '0',
  'art': '1',
  'building': '2',
  'event': '3',
  'location': '4',
  'organization': '5',
  'other': '6',
  'person': '7',
  'product': '8'})

In [5]:
words = fewnerd_all[idx]["words"]
labels = fewnerd_all[idx]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Known locally as `` Fairbottom Bobs    `` it is now preserved at the Henry    Ford     Museum   in Dearborn , Michigan . 
O     O       O  O  product    product O  O  O  O   O         O  O   building building building O  location O location O 


In [6]:
model_checkpoint = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
print(tokenizer)
tokenizer.is_fast

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


True

In [7]:
inputs = tokenizer(fewnerd_all[idx]["words"], is_split_into_words=True)
inputs.tokens(), inputs.word_ids()

(['<s>',
  'ĠKnown',
  'Ġlocally',
  'Ġas',
  'Ġ``',
  'ĠFair',
  'bottom',
  'ĠBob',
  's',
  'Ġ``',
  'Ġit',
  'Ġis',
  'Ġnow',
  'Ġpreserved',
  'Ġat',
  'Ġthe',
  'ĠHenry',
  'ĠFord',
  'ĠMuseum',
  'Ġin',
  'ĠDear',
  'born',
  'Ġ,',
  'ĠMichigan',
  'Ġ.',
  '</s>'],
 [None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None])

In [8]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

labels = fewnerd_all[22]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]
[-100, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 4, 0, 4, 0, -100]


In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

fewnerd_all_tokenized = fewnerd_all.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=fewnerd_all.column_names
)
fewnerd_all_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 188024
})

In [10]:
# make splits
dev_split = fewnerd_all_tokenized.train_test_split(test_size=4)["test"]
trainvalid_test_splits = fewnerd_all_tokenized.train_test_split(test_size=0.15)
trainvalid_split = trainvalid_test_splits["train"]
test_split_100 = trainvalid_test_splits["test"]
test_split_10 = test_split_100.train_test_split(test_size=0.1)["test"]
test_split_1 = test_split_10.train_test_split(test_size=0.1)["test"]
train_valid_split = trainvalid_split.train_test_split(test_size=0.15)
train_split_100 = train_valid_split["train"]
train_split_10 = train_split_100.train_test_split(test_size=0.1)["test"]
train_split_1 = train_split_10.train_test_split(test_size=0.1)["test"]
valid_split_100 = train_valid_split["test"]
valid_split_10 = valid_split_100.train_test_split(test_size=0.1)["test"]
valid_split_1 = valid_split_10.train_test_split(test_size=0.1)["test"]
fewnerd_ds = DatasetDict({
    "train_100": train_split_100,
    "valid_100": valid_split_100,
    "test_100": test_split_100,
    "train_10": train_split_10,
    "valid_10": valid_split_10,
    "test_10": test_split_10,
    "train_1": train_split_1,
    "valid_1": valid_split_1,
    "test_1": test_split_1,
    "dev": dev_split
})
fewnerd_ds

DatasetDict({
    train_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 135847
    })
    valid_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23973
    })
    test_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 28204
    })
    train_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13585
    })
    valid_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2398
    })
    test_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2821
    })
    train_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1359
    })
    valid_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
    test_1: Dataset({
        features: ['input_ids', 'attentio

In [11]:
instance = fewnerd_ds["dev"][0]
keys = instance.keys()
print(keys)
for key in keys:
    print(len(instance[key]))

dict_keys(['input_ids', 'attention_mask', 'labels'])
14
14
14


In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multipl

In [13]:
batch = data_collator([fewnerd_ds["train_1"][i] for i in range(2)])
batch["labels"]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    4,    4,    4,    4,    4,    4,    4,    0,    0,    0,    0,
            0,    0,    4,    4,    4,    4,    0,    4,    4,    0,    4,    0,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    4,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    4,    4,    4,    0,    4,    4,    4,    0,
            4,    4,    4,    4,    0,    0,    0,    0,    4,    4,    4,    0,
            0,    0,    0,    0,    0,    0, -100]])

In [14]:
labels

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [15]:
metric = load("seqeval")

In [16]:
#labels = fewnerd_ds["train"][0]["labels"]
#labels = [label_names[i] for i in labels]
#labels

In [17]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [28]:
dev = False
if dev:
    train_dataloader = DataLoader(fewnerd_ds["dev"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["dev"], collate_fn=data_collator, batch_size=8)
else:
    train_dataloader = DataLoader(fewnerd_ds["train_10"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["valid_10"], collate_fn=data_collator, batch_size=8)

In [29]:
import math
from peft import PeftModel
from peft import LoraConfig, prepare_model_for_int8_training, get_peft_model
from peft import get_peft_model, LoraConfig, TaskType
# datasets:      3 values [1%, 10%, 100%]
# lora_rank:    10 values [1, ..., 512]
# lora_dropout:  3 values [0, 0.2, 0.4]

r = 4
config = LoraConfig(
    # GUIDE   => https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft
    # https://arxiv.org/abs/2312.03732, https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft:~:text=use_rslora%3A%20When%20set%20to%20True%2C%20uses%20Rank%2DStabilized%20LoRA%20which%20sets%20the%20adapter%20scaling%20factor
    r=r,
    target_modules=["query", "key", "value", "query_proj", "key_proj", "value_proj"],
    bias="all",
    use_rslora=True, 
    task_type=TaskType.TOKEN_CLS,
    lora_dropout=0.2
)
print(config)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    load_in_8bit=True,
    device_map="auto"
)
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=4, target_modules={'value_proj', 'query', 'query_proj', 'key_proj', 'key', 'value'}, lora_alpha=8, lora_dropout=0.2, fan_in_fan_out=False, bias='all', use_rslora=True, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading adapter weights from logs led to unexpected keys not found in the model:  ['classifier.modules_to_save.bias', 'classifier.original_module.bias']. 


```
model = LlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
).bfloat16()
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```

In [30]:
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader
)

In [31]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [32]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
output_dir = "logs"

prof = FlopsProfiler(model) # deepspeed profiler
profile_step = 5
flops_list = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    prof.start_profile() # start profiling
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    #
    prof.stop_profile() # stop profiling
    total_flops = prof.get_total_flops()
    flops_list.append(total_flops)
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    results = metric.compute()
    print(f"epoch {epoch}:", {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]})
    # Save model and tokenizer
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#
prof.end_profile() # end profiling

  0%|          | 0/5097 [00:00<?, ?it/s]

[2024-02-22 16:06:19,558] [INFO] [profiler.py:80:start_profile] Flops profiler started


In [25]:
np_flops_list = np.array(flops_list)
print(flops_list)
np.mean(np_flops_list), np.sum(np_flops_list)

[1401275658240, 1392458122240, 1423246919680]


(1405660233386.6667, 4216980700160)

In [26]:
model_checkpoint = output_dir # local folder for model checkpoint
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'entity_group': 'LABEL_1',
  'score': 0.75313264,
  'word': 'My name is Sylvain and I work at Hugging Face in Brooklyn.',
  'start': 0,
  'end': 58}]

In [27]:
for i in range(10):
    rank = 2**i
    print(rank)

1
2
4
8
16
32
64
128
256
512
