# Token classification with LoRA
Done:
- count flops via [flops-profiler](https://pypi.org/project/flops-profiler/) in accelerate loop

ToDo:
- backup (always – never finished)
- use LoRA model
- find batch size automatically
- add wandb sweep
- tidy up

In [1]:
import torch
import numpy as np
import pandas as pd
from peft import PeftModel, LoraConfig, prepare_model_for_int8_training, get_peft_model, get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, get_scheduler
from evaluate import load
from accelerate import Accelerator
from tqdm.auto import tqdm
from deepspeed.profiling.flops_profiler import FlopsProfiler
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]]).rename_column("tokens", "words")
fewnerd_all

[2024-02-24 19:04:57,097] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [2]:
x = 104 # => 188024 (104 works, 105 doesn't work [CUDA OOM])
fewnerd_all = fewnerd_all.filter(lambda example: len(example["words"])<=x)
fewnerd_all

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188024
})

In [3]:
idx = 22
fewnerd_all[idx]["words"], fewnerd_all[idx]["ner_tags"]

(['Known',
  'locally',
  'as',
  '``',
  'Fairbottom',
  'Bobs',
  '``',
  'it',
  'is',
  'now',
  'preserved',
  'at',
  'the',
  'Henry',
  'Ford',
  'Museum',
  'in',
  'Dearborn',
  ',',
  'Michigan',
  '.'],
 [0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0])

In [4]:
label_names = fewnerd_all.features["ner_tags"].feature.names
print(label_names)
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product']


({'0': 'O',
  '1': 'art',
  '2': 'building',
  '3': 'event',
  '4': 'location',
  '5': 'organization',
  '6': 'other',
  '7': 'person',
  '8': 'product'},
 {'O': '0',
  'art': '1',
  'building': '2',
  'event': '3',
  'location': '4',
  'organization': '5',
  'other': '6',
  'person': '7',
  'product': '8'})

In [5]:
words = fewnerd_all[idx]["words"]
labels = fewnerd_all[idx]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Known locally as `` Fairbottom Bobs    `` it is now preserved at the Henry    Ford     Museum   in Dearborn , Michigan . 
O     O       O  O  product    product O  O  O  O   O         O  O   building building building O  location O location O 


In [6]:
model_checkpoint = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
print(tokenizer)
tokenizer.is_fast

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


True

In [7]:
inputs = tokenizer(fewnerd_all[idx]["words"], is_split_into_words=True)
inputs.tokens(), inputs.word_ids()

(['<s>',
  'ĠKnown',
  'Ġlocally',
  'Ġas',
  'Ġ``',
  'ĠFair',
  'bottom',
  'ĠBob',
  's',
  'Ġ``',
  'Ġit',
  'Ġis',
  'Ġnow',
  'Ġpreserved',
  'Ġat',
  'Ġthe',
  'ĠHenry',
  'ĠFord',
  'ĠMuseum',
  'Ġin',
  'ĠDear',
  'born',
  'Ġ,',
  'ĠMichigan',
  'Ġ.',
  '</s>'],
 [None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None])

In [8]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

labels = fewnerd_all[22]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]
[-100, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 4, 0, 4, 0, -100]


In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

fewnerd_all_tokenized = fewnerd_all.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=fewnerd_all.column_names
)
fewnerd_all_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 188024
})

In [10]:
# make splits
dev_split = fewnerd_all_tokenized.train_test_split(test_size=4)["test"]
trainvalid_test_splits = fewnerd_all_tokenized.train_test_split(test_size=0.15)
trainvalid_split = trainvalid_test_splits["train"]
test_split_100 = trainvalid_test_splits["test"]
test_split_10 = test_split_100.train_test_split(test_size=0.1)["test"]
test_split_1 = test_split_10.train_test_split(test_size=0.1)["test"]
train_valid_split = trainvalid_split.train_test_split(test_size=0.15)
train_split_100 = train_valid_split["train"]
train_split_10 = train_split_100.train_test_split(test_size=0.1)["test"]
train_split_1 = train_split_10.train_test_split(test_size=0.1)["test"]
valid_split_100 = train_valid_split["test"]
valid_split_10 = valid_split_100.train_test_split(test_size=0.1)["test"]
valid_split_1 = valid_split_10.train_test_split(test_size=0.1)["test"]
fewnerd_ds = DatasetDict({
    "train_100": train_split_100,
    "valid_100": valid_split_100,
    "test_100": test_split_100,
    "train_10": train_split_10,
    "valid_10": valid_split_10,
    "test_10": test_split_10,
    "train_1": train_split_1,
    "valid_1": valid_split_1,
    "test_1": test_split_1,
    "dev": dev_split
})
fewnerd_ds

DatasetDict({
    train_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 135847
    })
    valid_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23973
    })
    test_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 28204
    })
    train_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13585
    })
    valid_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2398
    })
    test_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2821
    })
    train_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1359
    })
    valid_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
    test_1: Dataset({
        features: ['input_ids', 'attentio

In [11]:
instance = fewnerd_ds["dev"][0]
keys = instance.keys()
print(keys)
for key in keys:
    print(len(instance[key]))

dict_keys(['input_ids', 'attention_mask', 'labels'])
67
67
67


In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multipl

In [13]:
batch = data_collator([fewnerd_ds["train_1"][i] for i in range(2)])
batch["labels"]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    4,    0,
            0,    5,    6,    5,    5,    6,    6,    0,    0,    5,    6,    6,
            6,    6,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100]])

In [14]:
labels

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [15]:
metric = load("seqeval")

In [16]:
#labels = fewnerd_ds["train"][0]["labels"]
#labels = [label_names[i] for i in labels]
#labels

In [17]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [18]:
dev = True
if dev:
    train_dataloader = DataLoader(fewnerd_ds["dev"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["dev"], collate_fn=data_collator, batch_size=8)
else:
    train_dataloader = DataLoader(fewnerd_ds["train_10"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["valid_10"], collate_fn=data_collator, batch_size=8)

In [19]:
# datasets:      3 values [1%, 10%, 100%]
# lora_rank:    10 values [1, ..., 512]
# lora_dropout:  5 values [0, 0.1, 0.2, 0.3, 0.4]
# lora_bias:     3 values ["all", "none", "lora_only"]

r = 4
config = LoraConfig(
    # GUIDE   => https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft
    # https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft:~:text=use_rslora%3A%20When%20set%20to%20True%2C%20uses%20Rank%2DStabilized%20LoRA%20which%20sets%20the%20adapter%20scaling%20factor
    # https://arxiv.org/abs/2312.03732, 
    r=r,
    target_modules=["query", "key", "value", "query_proj", "key_proj", "value_proj"],
    bias="lora_only",
    use_rslora=True,
    task_type=TaskType.TOKEN_CLS,
    lora_dropout=0.2
)
print(config)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    load_in_8bit=True,
    device_map="auto"
)
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=4, target_modules={'query_proj', 'key', 'query', 'value', 'value_proj', 'key_proj'}, lora_alpha=8, lora_dropout=0.2, fan_in_fan_out=False, bias='lora_only', use_rslora=True, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


```
model = LlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
).bfloat16()
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```

In [20]:
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader
)

In [21]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [22]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [23]:
output_dir = "logs"

prof = FlopsProfiler(model) # deepspeed profiler
profile_step = 5
flops_list = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    prof.start_profile() # start profiling
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    #
    prof.stop_profile() # stop profiling
    total_flops = prof.get_total_flops()
    flops_list.append(total_flops)
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    results = metric.compute()
    print(f"epoch {epoch}:", {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]})
    # Save model and tokenizer
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#
prof.end_profile() # end profiling

  0%|          | 0/3 [00:00<?, ?it/s]

[2024-02-24 19:05:03,132] [INFO] [profiler.py:80:start_profile] Flops profiler started


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


epoch 0: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.14678899082568808}
[2024-02-24 19:05:04,991] [INFO] [profiler.py:80:start_profile] Flops profiler started




epoch 1: {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'accuracy': 0.14220183486238533}
[2024-02-24 19:05:06,199] [INFO] [profiler.py:80:start_profile] Flops profiler started
epoch 2: {'precision': 0.034482758620689655, 'recall': 0.00819672131147541, 'f1': 0.013245033112582783, 'accuracy': 0.15137614678899083}
[2024-02-24 19:05:07,266] [INFO] [profiler.py:226:end_profile] Flops profiler finished


In [24]:
i_max = -1
len_max = -1
for i in range(fewnerd_all_tokenized.num_rows):
    len_i = len(fewnerd_all_tokenized[i]["input_ids"]) # the length of "input_ids" (=> tokenized) may exceeld the length of "words"
    if len_i > len_max:
        i_max = i
        len_max = len_i
i_max, len_max

(176228, 236)

In [25]:
from datasets import Dataset
i_max = -1
len_max = -1
for i in range(fewnerd_all_tokenized.num_rows):
    len_i = len(fewnerd_all_tokenized[i]["input_ids"]) # the length of "input_ids" (=> tokenized) may exceeld the length of "words"
    if len_i > len_max:
        i_max = i
        len_max = len_i
print(i_max, len_max)
#
instance_max = fewnerd_all_tokenized[i_max]
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()
instance_max_ds = Dataset.from_pandas(df)
instance_max_ds

176228 236


Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 1
})

In [26]:
len(concatenate_datasets([instance_max_ds, instance_max_ds]))

2

In [27]:
for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()



In [52]:
import copy, gc
#
instance_max = fewnerd_all_tokenized[i_max]
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()

#
bs_max = 0
bs_model = copy.copy(model)
bs_model.train()
bs_datacollator = copy.copy(data_collator)
bs_break = False
for bs_i in range(10):
    if bs_break==False:
        bs_batch_size = 2**bs_i
        # build dataset of correct size
        if bs_i==0:
            instance_max_ds = Dataset.from_pandas(df)
        else:
            instance_max_ds = concatenate_datasets([instance_max_ds, instance_max_ds])
        # define dataloader
        bs_dataloader = DataLoader(instance_max_ds, collate_fn=bs_datacollator, batch_size=bs_batch_size) # 8, 4, 2
        assert bs_batch_size==len(instance_max_ds)
        for bs_batch in bs_dataloader:
            try:
                bs_outputs = bs_model(**bs_batch)
                bs_loss = bs_outputs.loss
                accelerator.backward(bs_loss)
                optimizer.step()
                optimizer.zero_grad()
                bs_max = bs_batch_size
                print(f"bs_max={bs_max} works!")
            except:
                del instance_max, df, bs_model, bs_datacollator
                gc.collect()
                bs_break = True
                break
bs_max



bs_max=1 works!
bs_max=2 works!
bs_max=4 works!
bs_max=8 works!
bs_max=16 works!
bs_max=32 works!
bs_max=64 works!
bs_max=128 works!


128

In [44]:
bs_outputs.keys()

odict_keys(['loss', 'logits'])

In [29]:
np_flops_list = np.array(flops_list)
print(flops_list)
np.mean(np_flops_list), np.sum(np_flops_list)

[11134573568, 11143011328, 11090697216]


(11122760704.0, 33368282112)

In [30]:
model_checkpoint = output_dir # local folder for model checkpoint
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'entity_group': 'LABEL_1',
  'score': 0.6599767,
  'word': 'My name is Sylvain and I work at Hugging Face in Brooklyn.',
  'start': 0,
  'end': 58}]

In [31]:
for i in range(10):
    rank = 2**i
    print(rank)

1
2
4
8
16
32
64
128
256
512


In [32]:
# get largest instance
# use that instance to make a batch of size 2**n
# try to feed that batch through the model
# update batch to size 2**(n+1)

In [33]:
i_max = -1
len_max = -1
for i in range(fewnerd_all_tokenized.num_rows):
    len_i = len(fewnerd_all_tokenized[i]["input_ids"]) # the length of "input_ids" (=> tokenized) may exceeld the length of "words"
    if len_i > len_max:
        i_max = i
        len_max = len_i
i_max, len_max

(176228, 236)

In [34]:
instance_max = fewnerd_all_tokenized[i_max]
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()
instance_max_ds = Dataset.from_pandas(df)
instance_max_ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 1
})

In [35]:
#instance_max_ds = concatenate_datasets([instance_max_ds, instance_max_ds])
#instance_max_ds

In [36]:
instance_max_ds.shape

(1, 3)

In [37]:
instance_max = fewnerd_all_tokenized[i_max]
# make a loop
for i in range(10):
# on each iteration of the loop:
    ## get batch_size
    batch_size = 2**i
    ## make batch of size batch_size
    instance_max_batch = concatenate_datasets([instance_max_ds for _ in range(batch_size)])
    print(f"\ni={i}\nbatch_size={batch_size}\n{instance_max_batch.shape}")
    ## make a dataloader that yields a batch of the according batch_size
    instance_max_dataloader = DataLoader(instance_max_batch, collate_fn=data_collator, batch_size=batch_size)
    model.train()
    for batch in instance_max_dataloader:
        print(batch["attention_mask"].shape)
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
    
    print(instance_max_dataloader)
## try to send the batch of size batch_size through the model
### if success => update max batch_size and previous_batch_size
### if not (=> except; or other), handle the exception and return the previous_batch_size (=> max_batch_size)
for i in range(10):
    batch_size = 2**i
    batch = [instance_max for _ in range(batch_size)]
    #print(len(batch))


i=0
batch_size=1
(1, 3)
torch.Size([1, 236])




<torch.utils.data.dataloader.DataLoader object at 0x7fc282f91ed0>

i=1
batch_size=2
(2, 3)
torch.Size([2, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc282f91030>

i=2
batch_size=4
(4, 3)
torch.Size([4, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc2802478b0>

i=3
batch_size=8
(8, 3)
torch.Size([8, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc2802457e0>

i=4
batch_size=16
(16, 3)
torch.Size([16, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc287f72500>

i=5
batch_size=32
(32, 3)
torch.Size([32, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc280244d90>

i=6
batch_size=64
(64, 3)
torch.Size([64, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc282f91030>

i=7
batch_size=128
(128, 3)
torch.Size([128, 236])
<torch.utils.data.dataloader.DataLoader object at 0x7fc282f92e60>

i=8
batch_size=256
(256, 3)
torch.Size([256, 236])


OutOfMemoryError: CUDA out of memory. Tried to allocate 872.00 MiB. GPU 0 has a total capacty of 11.77 GiB of which 152.31 MiB is free. Including non-PyTorch memory, this process has 11.61 GiB memory in use. Of the allocated memory 8.05 GiB is allocated by PyTorch, and 2.59 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
instance_max = fewnerd_all_tokenized[i_max]
# make a loop
for i in range(10):
# on each iteration of the loop:
    ## get batch_size
    batch_size = 2**i
    ## make batch of size batch_size
    instance_max_batch = concatenate_datasets([instance_max_ds for _ in range(batch_size)])
    print(f"\ni={i}\nbatch_size={batch_size}\n{instance_max_batch.shape}")
    ## make a dataloader that yields a batch of the according batch_size
    instance_max_dataloader = DataLoader(instance_max_batch, collate_fn=data_collator, batch_size=batch_size)
    model.train()
    for batch in instance_max_dataloader:
        print(batch["attention_mask"].shape[0])
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
    
    print(instance_max_dataloader)
## try to send the batch of size batch_size through the model
### if success => update max batch_size and previous_batch_size
### if not (=> except; or other), handle the exception and return the previous_batch_size (=> max_batch_size)
for i in range(10):
    batch_size = 2**i
    batch = [instance_max for _ in range(batch_size)]
    #print(len(batch))

In [None]:
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()
batch_size_ds = Dataset.from_pandas(df)
batch_size_ds

In [None]:
df = pd.concat([df, df])
df

In [None]:
df2 = df
df2 + df2

In [None]:
df = pd.DataFrame.from_dict(instance_max)
df

In [None]:
fewnerd_ds

In [None]:
batch = data_collator([fewnerd_ds["train_100"][i] for i in range(2)])
batch

In [None]:
batch["input_ids"]

In [None]:
model(batch)

In [None]:
batch[1] = batch[0]
len(batch)

In [None]:
model(**batch)

In [None]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)

https://huggingface.co/docs/transformers/main_classes/output

In [None]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs)
outputs

In [None]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
model(**inputs)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("mine, too", return_tensors="pt")
inputs

In [None]:
model(**inputs)

In [None]:
try_dataloader = DataLoader(fewnerd_ds["train_10"], shuffle=True, collate_fn=data_collator, batch_size=8)
try_dataloader[0]

In [None]:
data_loader = DataLoader(dataset, batch_size=i)
data_loader

In [None]:
type(instance_max)

In [None]:
instance_max

In [None]:
import pandas as pd
df = pd.DataFrame.from_dict(instance_max, orient="index")
df

In [None]:
df2 = pd.DataFrame.from_records(instance_max)
df2

In [None]:
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
pd.DataFrame(data=d, index=[0, 1, 2, 3])

In [None]:
from datasets import Dataset
ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"], "type": ["grass", "water"]})
ds

In [None]:
Dataset.from_dict({})

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import wandb

###
dataset = fewnerd_all_tokenized
###
# Example: Loading a pre-trained model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define your hardware constraints
max_memory = 16 * 1024  # Example: maximum memory in megabytes
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define your starting and maximum batch size
start_batch_size = 16
max_batch_size = 512

# Define a function to check if a batch size is valid
def is_valid_batch_size(batch_size):
    try:
        data_loader = DataLoader(dataset, batch_size=batch_size)
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    except RuntimeError as e:
        if "out of memory" in str(e):
            return False
        else:
            raise
    return True

# Perform a binary search to find the largest valid batch size
left, right = start_batch_size, max_batch_size
while left < right:
    mid = (left + right) // 2
    if is_valid_batch_size(mid):
        left = mid + 1
    else:
        right = mid

largest_valid_batch_size = left - 1

# Log the result using Weights & Biases
wandb.log({"largest_valid_batch_size": largest_valid_batch_size})
print("Largest valid batch size:", largest_valid_batch_size)

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Assuming you have a dataset where each sample is a dictionary containing 'input_ids', 'attention_mask', and 'labels'
# Each of these elements could be lists of varying lengths

# Padding function
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    # Pad sequences to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)  # Assuming tokenizer.pad_token_id is 0
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a dataset where each sample is a dictionary containing 'input_ids', 'attention_mask', and 'labels'
# Each of these elements could be lists of varying lengths

# Padding function
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]
    
    # Pad sequences to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)  # Assuming tokenizer.pad_token_id is 0
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a dataset where each sample is a dictionary containing 'input_text', 'labels'
# Each of these elements could be lists of varying lengths

# Tokenize function
def tokenize_batch(batch):
    inputs = tokenizer(batch['input_text'], return_tensors="pt", padding=True, truncation=True)
    labels = batch['labels']
    return inputs, labels

# Padding function
def collate_fn(batch):
    input_texts = [item['input_text'] for item in batch]
    labels = [item['labels'] for item in batch]
    
    inputs, labels = tokenize_batch(batch)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a DataLoader named 'data_loader' and the dataset is already tokenized

# No need for a collate function since the dataset is already tokenized
data_loader = DataLoader(dataset, batch_size=batch_size)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a DataLoader named 'data_loader' and the dataset is already tokenized

# Padding function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['labels'] for item in batch]
    
    # Pad labels to the same length
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a DataLoader named 'data_loader' and the dataset is already tokenized

# Padding function
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]  # Convert list of lists to list of tensors
    
    # Pad labels to the same length
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a DataLoader named 'data_loader' and the dataset is already tokenized

# Padding function
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]  # Convert to tensor
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]  # Convert to tensor
    labels = [torch.tensor(item['labels']) for item in batch]  # Convert to tensor
    
    # Pad labels to the same length
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=torch.stack(input_ids), attention_mask=torch.stack(attention_mask), labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
from torch.nn.utils.rnn import pad_sequence
import torch

# Example: Loading a pre-trained model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Define batch size
batch_size = 8

# Assuming you have a DataLoader named 'data_loader' and the dataset is already tokenized

# Padding function
def collate_fn(batch):
    input_ids = [torch.tensor(item['input_ids']) for item in batch]  # Convert to tensor
    attention_mask = [torch.tensor(item['attention_mask']) for item in batch]  # Convert to tensor
    labels = [torch.tensor(item['labels']) for item in batch]  # Convert to tensor
    
    # Pad sequences to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)  # Assuming 0 is the pad value for attention mask
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is the ignore_index for loss calculation
    
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# Assuming you have a DataLoader named 'data_loader'
data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

# Get a single batch from the DataLoader
for batch in data_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels']
    
    # Pass the batch through the model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    # You can then do whatever you need with the outputs

In [None]:
dataloader = try_dataloader

In [None]:
#from transformers import AutoModelForSequenceClassification, AutoTokenizer
#import torch

# Assuming 'dataloader' is your DataLoader instance
# and 'model' is your loaded model

# Move model to the appropriate device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set model to evaluation mode if you are not training
model.eval()

# Fetch a single batch from the DataLoader
batch = next(iter(dataloader))

# Assuming your batch is a dictionary with input_ids, attention_mask, and labels
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device) # If you have labels

# Forward pass, get model predictions
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

# Process your outputs as needed (e.g., extract logits)
logits = outputs.logits
