## Token classification with LoRA
ToDo:
- backup
- count flops via [flops-profiler](https://pypi.org/project/flops-profiler/) in accelerate loop
- use LoRA model
- find batch size automatically
- add wandb sweep
- tidy up

In [1]:
from deepspeed.profiling.flops_profiler import FlopsProfiler
from datasets import load_dataset, concatenate_datasets, DatasetDict
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]]).rename_column("tokens", "words")
fewnerd_all

[2024-02-21 02:20:16,039] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [2]:
# results
# does work   : 50, 80, 90, 100, 104
# doesn't work: 105, 110, 120, 140
x = 104 # => 188024
fewnerd_all = fewnerd_all.filter(lambda example: len(example["words"])<=x)
fewnerd_all

Filter:   0%|          | 0/188239 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188024
})

In [3]:
fewnerd_all[0]["words"]

['Paul', 'International', 'airport', '.']

In [4]:
fewnerd_all[0]["ner_tags"]

[0, 0, 0, 0]

In [5]:
ner_feature = fewnerd_all.features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product'], id=None), length=-1, id=None)

In [6]:
label_names = ner_feature.feature.names
label_names

['O',
 'art',
 'building',
 'event',
 'location',
 'organization',
 'other',
 'person',
 'product']

In [7]:
words = fewnerd_all[0]["words"]
labels = fewnerd_all[0]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Paul International airport . 
O    O             O       O 


In [8]:
from transformers import AutoTokenizer
model_checkpoint = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
tokenizer

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [9]:
tokenizer.is_fast

True

In [10]:
inputs = tokenizer(fewnerd_all[0]["words"], is_split_into_words=True)
inputs.tokens()

['<s>', 'ĠPaul', 'ĠInternational', 'Ġairport', 'Ġ.', '</s>']

In [11]:
inputs.word_ids()

[None, 0, 1, 2, 3, None]

In [12]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

In [13]:
labels = fewnerd_all[0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0]
[-100, 0, 0, 0, 0, -100]


In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [15]:
fewnerd_all_tokenized = fewnerd_all.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=fewnerd_all.column_names
)
fewnerd_all_tokenized

Map:   0%|          | 0/188024 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 188024
})

In [16]:
# make splits
dev_split = fewnerd_all_tokenized.train_test_split(test_size=4)["test"]
trainvalid_test_splits = fewnerd_all_tokenized.train_test_split(test_size=0.15)
test_split = trainvalid_test_splits["test"]
trainvalid_split = trainvalid_test_splits["train"]
train_valid_split = trainvalid_split.train_test_split(test_size=0.15)
train_split = train_valid_split["train"]
valid_split = train_valid_split["test"]
fewnerd_ds = DatasetDict({
    "train": train_split,
    "valid": valid_split,
    "test": test_split,
    "dev": dev_split
})
fewnerd_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 135847
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23973
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 28204
    })
    dev: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4
    })
})

In [17]:
instance = fewnerd_ds["dev"][0]
keys = instance.keys()
print(keys)
for key in keys:
    print(len(instance[key]))

dict_keys(['input_ids', 'attention_mask', 'labels'])
15
15
15


In [18]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multipl

In [19]:
batch = data_collator([fewnerd_ds["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    0,    0,    4,    4,    0,    4,    0,
            0,    0,    0,    0,    0,    0,    0,    7,    8,    8,    0,    0,
            5,    5,    5,    5,    7,    8,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    7,
            8,    0,    0,    0,    0,    0, -100],
        [-100,    0,    5,    6,    0,    0,    0,    0,    0,    0,    0, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100]])

In [20]:
labels

[0, 0, 0, 0]

In [21]:
from evaluate import load
metric = load("seqeval")

In [22]:
#labels = fewnerd_ds["train"][0]["labels"]
#labels = [label_names[i] for i in labels]
#labels

In [23]:
import numpy as np
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [24]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

({'0': 'O',
  '1': 'art',
  '2': 'building',
  '3': 'event',
  '4': 'location',
  '5': 'organization',
  '6': 'other',
  '7': 'person',
  '8': 'product'},
 {'O': '0',
  'art': '1',
  'building': '2',
  'event': '3',
  'location': '4',
  'organization': '5',
  'other': '6',
  'person': '7',
  'product': '8'})

In [25]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(fewnerd_ds["train"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
eval_dataloader = DataLoader(fewnerd_ds["valid"], collate_fn=data_collator, batch_size=8)

In [27]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=2e-5)

In [29]:
from accelerate import Accelerator
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader
)

In [30]:
from transformers import get_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [31]:
from huggingface_hub import Repository, get_full_repo_name
model_name = "FacebookAI_roberta-large-finetuned-ner-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

'mdroth/FacebookAI_roberta-large-finetuned-ner-accelerate'

In [32]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [33]:
output_dir = "logs"

In [None]:
from tqdm.auto import tqdm
import torch

prof = FlopsProfiler(model) # deepspeed profiler
profile_step = 5
flops_list = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    prof.start_profile() # start profiling
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    #
    prof.stop_profile()  # stop profiling
    total_flops = prof.get_total_flops()
    print(f"{total_flops}")
    flops_list.append(total_flops)
    prof.print_model_profile(profile_step=profile_step)
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        # pad predictions and labels before being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    results = metric.compute()
    print(f"epoch {epoch}:", {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]})
    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#
print(f"{prof.get_total_flops()}")
prof.end_profile() # end profiling

  0%|          | 0/50943 [00:00<?, ?it/s]

[2024-02-21 02:20:46,416] [INFO] [profiler.py:80:start_profile] Flops profiler started
4637609522103936

-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 5:
Notations:
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)

params per GPU:                                                         354.32 M
params of model = params per GPU * mp_size:                             0       
fwd MACs per GPU:                                                       2317.76 TMACs
fwd flops per GPU:                                                      4637.61 T
fwd flops of model = fwd flops per GPU * mp_size:  



epoch 0: {'precision': 0.8309083339560405, 'recall': 0.783785960990444, 'f1': 0.8066595504337102, 'accuracy': 0.938159417036487}
[2024-02-21 03:41:05,862] [INFO] [profiler.py:80:start_profile] Flops profiler started


In [None]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
from transformers import pipeline
# Replace this with your own checkpoint
model_checkpoint = output_dir # local folder for model checkpoint
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

In [None]:
flops_list