# Token classification with LoRA
Done:
- count flops via [flops-profiler](https://pypi.org/project/flops-profiler/) in accelerate loop

ToDo:
- backup (always – never finished)
- use LoRA model
- find batch size automatically
- add wandb sweep
- tidy up

In [1]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import pipeline, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, get_scheduler
from evaluate import load
from accelerate import Accelerator
from tqdm.auto import tqdm
from deepspeed.profiling.flops_profiler import FlopsProfiler
fewnerd = load_dataset("DFKI-SLT/few-nerd", "supervised")
fewnerd_all = concatenate_datasets([fewnerd["train"], fewnerd["validation"], fewnerd["test"]]).rename_column("tokens", "words")
fewnerd_all

[2024-02-23 04:05:38,890] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188239
})

In [2]:
x = 104 # => 188024 (104 works, 105 doesn't work [CUDA OOM])
fewnerd_all = fewnerd_all.filter(lambda example: len(example["words"])<=x)
fewnerd_all

Dataset({
    features: ['id', 'words', 'ner_tags', 'fine_ner_tags'],
    num_rows: 188024
})

In [3]:
idx = 22
fewnerd_all[idx]["words"], fewnerd_all[idx]["ner_tags"]

(['Known',
  'locally',
  'as',
  '``',
  'Fairbottom',
  'Bobs',
  '``',
  'it',
  'is',
  'now',
  'preserved',
  'at',
  'the',
  'Henry',
  'Ford',
  'Museum',
  'in',
  'Dearborn',
  ',',
  'Michigan',
  '.'],
 [0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0])

In [4]:
label_names = fewnerd_all.features["ner_tags"].feature.names
print(label_names)
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product']


({'0': 'O',
  '1': 'art',
  '2': 'building',
  '3': 'event',
  '4': 'location',
  '5': 'organization',
  '6': 'other',
  '7': 'person',
  '8': 'product'},
 {'O': '0',
  'art': '1',
  'building': '2',
  'event': '3',
  'location': '4',
  'organization': '5',
  'other': '6',
  'person': '7',
  'product': '8'})

In [5]:
words = fewnerd_all[idx]["words"]
labels = fewnerd_all[idx]["ner_tags"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)
print(line1)
print(line2)

Known locally as `` Fairbottom Bobs    `` it is now preserved at the Henry    Ford     Museum   in Dearborn , Michigan . 
O     O       O  O  product    product O  O  O  O   O         O  O   building building building O  location O location O 


In [6]:
model_checkpoint = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
print(tokenizer)
tokenizer.is_fast

RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}


True

In [7]:
inputs = tokenizer(fewnerd_all[idx]["words"], is_split_into_words=True)
inputs.tokens(), inputs.word_ids()

(['<s>',
  'ĠKnown',
  'Ġlocally',
  'Ġas',
  'Ġ``',
  'ĠFair',
  'bottom',
  'ĠBob',
  's',
  'Ġ``',
  'Ġit',
  'Ġis',
  'Ġnow',
  'Ġpreserved',
  'Ġat',
  'Ġthe',
  'ĠHenry',
  'ĠFord',
  'ĠMuseum',
  'Ġin',
  'ĠDear',
  'born',
  'Ġ,',
  'ĠMichigan',
  'Ġ.',
  '</s>'],
 [None,
  0,
  1,
  2,
  3,
  4,
  4,
  5,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  17,
  18,
  19,
  20,
  None])

In [8]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels

labels = fewnerd_all[22]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]
[-100, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 4, 0, 4, 0, -100]


In [9]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

fewnerd_all_tokenized = fewnerd_all.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=fewnerd_all.column_names
)
fewnerd_all_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 188024
})

In [10]:
# make splits
dev_split = fewnerd_all_tokenized.train_test_split(test_size=4)["test"]
trainvalid_test_splits = fewnerd_all_tokenized.train_test_split(test_size=0.15)
trainvalid_split = trainvalid_test_splits["train"]
test_split_100 = trainvalid_test_splits["test"]
test_split_10 = test_split_100.train_test_split(test_size=0.1)["test"]
test_split_1 = test_split_10.train_test_split(test_size=0.1)["test"]
train_valid_split = trainvalid_split.train_test_split(test_size=0.15)
train_split_100 = train_valid_split["train"]
train_split_10 = train_split_100.train_test_split(test_size=0.1)["test"]
train_split_1 = train_split_10.train_test_split(test_size=0.1)["test"]
valid_split_100 = train_valid_split["test"]
valid_split_10 = valid_split_100.train_test_split(test_size=0.1)["test"]
valid_split_1 = valid_split_10.train_test_split(test_size=0.1)["test"]
fewnerd_ds = DatasetDict({
    "train_100": train_split_100,
    "valid_100": valid_split_100,
    "test_100": test_split_100,
    "train_10": train_split_10,
    "valid_10": valid_split_10,
    "test_10": test_split_10,
    "train_1": train_split_1,
    "valid_1": valid_split_1,
    "test_1": test_split_1,
    "dev": dev_split
})
fewnerd_ds

DatasetDict({
    train_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 135847
    })
    valid_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 23973
    })
    test_100: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 28204
    })
    train_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13585
    })
    valid_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2398
    })
    test_10: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2821
    })
    train_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1359
    })
    valid_1: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
    test_1: Dataset({
        features: ['input_ids', 'attentio

In [11]:
instance = fewnerd_ds["dev"][0]
keys = instance.keys()
print(keys)
for key in keys:
    print(len(instance[key]))

dict_keys(['input_ids', 'attention_mask', 'labels'])
21
21
21


In [12]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
data_collator

DataCollatorForTokenClassification(tokenizer=RobertaTokenizerFast(name_or_path='FacebookAI/roberta-large', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multipl

In [13]:
batch = data_collator([fewnerd_ds["train_1"][i] for i in range(2)])
batch["labels"]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    2,    2,    2,    0,    2,    2,    2,    2,    0,    0,    0,
            0,    2,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    2,    0, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    4,    4,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100]])

In [14]:
labels

[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 0, 4, 0]

In [15]:
metric = load("seqeval")

In [16]:
#labels = fewnerd_ds["train"][0]["labels"]
#labels = [label_names[i] for i in labels]
#labels

In [17]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [18]:
dev = False
if dev:
    train_dataloader = DataLoader(fewnerd_ds["dev"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["dev"], collate_fn=data_collator, batch_size=8)
else:
    train_dataloader = DataLoader(fewnerd_ds["train_10"], shuffle=True, collate_fn=data_collator, batch_size=8) # 8, 4, 2
    eval_dataloader = DataLoader(fewnerd_ds["valid_10"], collate_fn=data_collator, batch_size=8)

In [19]:
import math
from peft import PeftModel
from peft import LoraConfig, prepare_model_for_int8_training, get_peft_model
from peft import get_peft_model, LoraConfig, TaskType
# datasets:      3 values [1%, 10%, 100%]
# lora_rank:    10 values [1, ..., 512]
# lora_dropout:  5 values [0, 0.1, 0.2, 0.3, 0.4]
# lora_bias:     3 values ["all", "none", "lora_only"]

r = 4
config = LoraConfig(
    # GUIDE   => https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft
    # https://huggingface.co/docs/peft/main/en/conceptual_guides/lora#common-lora-parameters-in-peft:~:text=use_rslora%3A%20When%20set%20to%20True%2C%20uses%20Rank%2DStabilized%20LoRA%20which%20sets%20the%20adapter%20scaling%20factor
    # https://arxiv.org/abs/2312.03732, 
    r=r,
    target_modules=["query", "key", "value", "query_proj", "key_proj", "value_proj"],
    bias="lora_only",
    use_rslora=True,
    task_type=TaskType.TOKEN_CLS,
    lora_dropout=0.2
)
print(config)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    load_in_8bit=True,
    device_map="auto"
)
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.TOKEN_CLS: 'TOKEN_CLS'>, inference_mode=False, r=4, target_modules={'key_proj', 'query_proj', 'value_proj', 'key', 'query', 'value'}, lora_alpha=8, lora_dropout=0.2, fan_in_fan_out=False, bias='lora_only', use_rslora=True, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


```
model = LlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
).bfloat16()
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```

In [20]:
optimizer = AdamW(model.parameters(), lr=2e-5)
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model,
    optimizer,
    train_dataloader,
    eval_dataloader
)

In [21]:
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [22]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [23]:
output_dir = "logs"

prof = FlopsProfiler(model) # deepspeed profiler
profile_step = 5
flops_list = []

progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_train_epochs):
    # Training
    model.train()
    prof.start_profile() # start profiling
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    #
    prof.stop_profile() # stop profiling
    total_flops = prof.get_total_flops()
    flops_list.append(total_flops)
    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)
        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)
    results = metric.compute()
    print(f"epoch {epoch}:", {key: results[f"overall_{key}"] for key in ["precision", "recall", "f1", "accuracy"]})
    # Save model and tokenizer
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
#
prof.end_profile() # end profiling

  0%|          | 0/5097 [00:00<?, ?it/s]

[2024-02-23 04:05:45,458] [INFO] [profiler.py:80:start_profile] Flops profiler started




epoch 0: {'precision': 0.5121317157712305, 'recall': 0.5332731784344688, 'f1': 0.5224886727815228, 'accuracy': 0.8845230426889968}
[2024-02-23 04:19:16,803] [INFO] [profiler.py:80:start_profile] Flops profiler started
epoch 1: {'precision': 0.6522963604852686, 'recall': 0.6286668754567283, 'f1': 0.6402636755090108, 'accuracy': 0.9067166364115476}
[2024-02-23 04:32:43,890] [INFO] [profiler.py:80:start_profile] Flops profiler started
epoch 2: {'precision': 0.6663778162911612, 'recall': 0.6386380151562338, 'f1': 0.6522130930294195, 'accuracy': 0.9090589682028459}
[2024-02-23 04:46:13,362] [INFO] [profiler.py:226:end_profile] Flops profiler finished


In [24]:
np_flops_list = np.array(flops_list)
print(flops_list)
np.mean(np_flops_list), np.sum(np_flops_list)

[13521575735296, 13286347696128, 13394332036864]


(13400751822762.666, 40202255468288)

In [25]:
model_checkpoint = output_dir # local folder for model checkpoint
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'entity_group': 'LABEL_1',
  'score': 0.63658154,
  'word': 'My name is Sylvain and I work at Hugging Face in Brooklyn.',
  'start': 0,
  'end': 58}]

In [26]:
for i in range(10):
    rank = 2**i
    print(rank)

1
2
4
8
16
32
64
128
256
512


In [27]:
# get largest instance
# use that instance to make a batch of size 2**n
# try to feed that batch through the model
# update batch to size 2**(n+1)

In [37]:
i_max = -1
len_max = -1
for i in range(fewnerd_all_tokenized.num_rows):
    len_i = len(fewnerd_all_tokenized[i]["input_ids"]) # the length of "input_ids" (=> tokenized) may exceeld the length of "words"
    if len_i > len_max:
        i_max = i
        len_max = len_i
i_max, len_max

(176228, 236)

In [145]:
instance_max = fewnerd_all_tokenized[i_max]
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()
instance_max_ds = Dataset.from_pandas(df)
instance_max_ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 1
})

In [146]:
instance_max_ds = concatenate_datasets([instance_max_ds, instance_max_ds])
instance_max_ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 2
})

In [138]:
# make a loop
# on each iteration of the loop:
## get batch_size
## make batch of size batch_size
## make a dataloader that yields a batch of the according batch_size
## try to send the batch of size batch_size through the model
### if success => update max batch_size and previous_batch_size
### if not (=> except; or other), handle the exception and return the previous_batch_size (=> max_batch_size)
for i in range(10):
    batch_size = 2**i
    batch = [instance_max for _ in range(batch_size)]
    print(len(batch))

1
2
4
8
16
32
64
128
256
512
1024
2048


In [134]:
# https://stackoverflow.com/questions/66266232/pandas-collapse-values-of-columns-to-lists
df = pd.DataFrame.from_dict(instance_max)
df = df.stack().reset_index(level=0, drop=True)
df = df.groupby(df.index).apply(list).to_frame().transpose()
batch_size_ds = Dataset.from_pandas(df)
batch_size_ds

Dataset({
    features: ['attention_mask', 'input_ids', 'labels'],
    num_rows: 1
})

In [132]:
df = pd.concat([df, df])
df

Unnamed: 0,attention_mask,input_ids,labels
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 36, 1205, 4832, 21277, 1401, 4, 39367, 4, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 36, 1205, 4832, 21277, 1401, 4, 39367, 4, ...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Dataset({
    features: ['attention_mask', 'input_ids', 'labels', '__index_level_0__'],
    num_rows: 2
})

In [126]:
df2 = df
df2 + df2

AttributeError: 'DataFrame' object has no attribute 'append'

In [119]:
df = pd.DataFrame.from_dict(instance_max)
df

Unnamed: 0,input_ids,attention_mask,labels
0,0,1,-100
1,36,1,0
2,1205,1,0
3,4832,1,0
4,21277,1,0
...,...,...,...
231,401,1,0
232,298,1,0
233,448,1,0
234,4839,1,0


In [None]:
fewnerd_ds

In [92]:
batch = data_collator([fewnerd_ds["train_100"][i] for i in range(2)])
batch

{'input_ids': tensor([[    0,    96,   928,  2156,  8097, 15022,  2061,    15,  1861, 11075,
             8,  5307,   479,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  6916, 10019,   726,    12,   448, 12967,  2401,   718,    36,
          2421,   502,   973,  2156, 13025,  4839,  2156,    67, 29341, 12153,
         10019,   726,    12,   448, 12967,  2401,   718,  2156,    16,   770,
            30,     5,   315,   532,   168,    11,  2748,    19,     5,   502,
           564,  2156,  8008,  2156,   908,    15,     5,  2218, 22468, 26696,
          2632,   583,  8086,   895,  3917,  2156,  2030,  3466,   479,     2]]), 'attention_mask': tensor([[1, 1, 1

In [96]:
batch["input_ids"]

tensor([[    0,    96,   928,  2156,  8097, 15022,  2061,    15,  1861, 11075,
             8,  5307,   479,     2,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  6916, 10019,   726,    12,   448, 12967,  2401,   718,    36,
          2421,   502,   973,  2156, 13025,  4839,  2156,    67, 29341, 12153,
         10019,   726,    12,   448, 12967,  2401,   718,  2156,    16,   770,
            30,     5,   315,   532,   168,    11,  2748,    19,     5,   502,
           564,  2156,  8008,  2156,   908,    15,     5,  2218, 22468, 26696,
          2632,   583,  8086,   895,  3917,  2156,  2030,  3466,   479,     2]])

In [94]:
model(batch)

KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [85]:
batch[1] = batch[0]
len(batch)

128

In [60]:
model(**batch)

TypeError: PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): RobertaForTokenClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.2, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=4, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=4, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                  )
                  (key): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.2, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=4, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=4, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                  )
                  (value): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.2, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=4, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=4, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
                    (lora_embedding_B): ParameterDict()
                  )
                  (dropout): Dropout(p=0.1, inplace=False)
                )
                (output): RobertaSelfOutput(
                  (dense): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
                  (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                  (dropout): Dropout(p=0.1, inplace=False)
                )
              )
              (intermediate): RobertaIntermediate(
                (dense): Linear8bitLt(in_features=1024, out_features=4096, bias=True)
                (intermediate_act_fn): GELUActivation()
              )
              (output): RobertaOutput(
                (dense): Linear8bitLt(in_features=4096, out_features=1024, bias=True)
                (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
          )
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (classifier): ModulesToSaveWrapper(
        (original_module): Linear(in_features=1024, out_features=9, bias=True)
        (modules_to_save): ModuleDict(
          (default): Linear(in_features=1024, out_features=9, bias=True)
        )
      )
    )
  )
) argument after ** must be a mapping, not list

In [63]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)

ValueError: Expected input batch_size (8) to match target batch_size (1).

https://huggingface.co/docs/transformers/main_classes/output

In [56]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs)
outputs

TokenClassifierOutput(loss={'logits': tensor([[[ 2.3109, -0.1272, -0.0090, -0.2762, -1.0147, -0.7503, -0.0249,
          -0.3052,  0.0560],
         [ 6.0512, -1.1368, -1.8989, -1.9052, -1.4983, -1.7293, -0.5406,
          -0.9525, -0.9822],
         [ 5.8347, -1.3259, -1.7608, -1.3189, -1.6006, -1.4316, -1.3654,
          -1.1167, -1.1731],
         [ 5.2818, -0.8098, -1.9488, -1.7264, -1.1526, -1.4212, -1.2466,
          -1.5055, -0.6716],
         [ 4.5569, -1.2706, -2.3851, -1.7166, -0.6587, -1.2006, -0.7005,
          -1.6966, -0.3249],
         [ 6.0402, -1.7068, -2.1786, -1.8490, -1.4007, -1.5828, -1.1663,
          -1.3350, -1.2373],
         [ 6.0869, -1.6930, -2.3488, -2.0532, -1.4194, -1.6210, -0.9737,
          -1.4418, -1.3136],
         [ 5.3404, -1.3242, -1.8866, -2.5230, -1.2383, -1.3390, -0.8817,
          -1.3000, -0.9799]]], grad_fn=<ToCopyBackward0>)}, logits=tensor([[[ 2.3109, -0.1272, -0.0090, -0.2762, -1.0147, -0.7503, -0.0249,
          -0.3052,  0.0560],
      

In [59]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
model(**inputs)

TokenClassifierOutput(loss={'logits': tensor([[[ 2.3109, -0.1272, -0.0090, -0.2762, -1.0147, -0.7503, -0.0249,
          -0.3052,  0.0560],
         [ 6.0512, -1.1368, -1.8989, -1.9052, -1.4983, -1.7293, -0.5406,
          -0.9525, -0.9822],
         [ 5.8347, -1.3259, -1.7608, -1.3189, -1.6006, -1.4316, -1.3654,
          -1.1167, -1.1731],
         [ 5.2818, -0.8098, -1.9488, -1.7264, -1.1526, -1.4212, -1.2466,
          -1.5055, -0.6716],
         [ 4.5569, -1.2706, -2.3851, -1.7166, -0.6587, -1.2006, -0.7005,
          -1.6966, -0.3249],
         [ 6.0402, -1.7068, -2.1786, -1.8490, -1.4007, -1.5828, -1.1663,
          -1.3350, -1.2373],
         [ 6.0869, -1.6930, -2.3488, -2.0532, -1.4194, -1.6210, -0.9737,
          -1.4418, -1.3136],
         [ 5.3404, -1.3242, -1.8866, -2.5230, -1.2383, -1.3390, -0.8817,
          -1.3000, -0.9799]]], grad_fn=<ToCopyBackward0>)}, logits=tensor([[[ 2.3109, -0.1272, -0.0090, -0.2762, -1.0147, -0.7503, -0.0249,
          -0.3052,  0.0560],
      

In [116]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("mine, too", return_tensors="pt")
inputs

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[ 101, 3067, 1010, 2205,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [73]:
model(**inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1612,  0.0953]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [79]:
try_dataloader = DataLoader(fewnerd_ds["train_10"], shuffle=True, collate_fn=data_collator, batch_size=8)
try_dataloader[0]

TypeError: 'DataLoader' object is not subscriptable

In [98]:
data_loader = DataLoader(dataset, batch_size=i)
data_loader

NameError: name 'dataset' is not defined

In [99]:
type(instance_max)

dict

In [100]:
instance_max

{'input_ids': [0,
  36,
  1205,
  4832,
  21277,
  1401,
  4,
  39367,
  4,
  175,
  73,
  21061,
  17487,
  2231,
  5214,
  119,
  25554,
  2744,
  12161,
  18495,
  2744,
  417,
  35449,
  2744,
  31645,
  22505,
  359,
  28127,
  25606,
  910,
  462,
  329,
  5214,
  134,
  347,
  134,
  9335,
  29395,
  1215,
  225,
  3048,
  38997,
  3048,
  38997,
  359,
  28127,
  25606,
  579,
  1178,
  37959,
  506,
  5214,
  2118,
  242,
  530,
  330,
  40935,
  267,
  119,
  846,
  462,
  406,
  267,
  18313,
  246,
  16972,
  401,
  605,
  176,
  791,
  102,
  791,
  1343,
  401,
  90,
  530,
  448,
  42385,
  673,
  1864,
  35,
  13726,
  2481,
  1646,
  35556,
  38500,
  359,
  28127,
  25606,
  326,
  43531,
  5214,
  13239,
  359,
  28127,
  25606,
  1300,
  5214,
  9060,
  359,
  28127,
  25606,
  1437,
  11726,
  1178,
  5214,
  134,
  359,
  28127,
  25606,
  20018,
  5214,
  401,
  428,
  306,
  11538,
  791,
  257,
  32675,
  4186,
  401,
  298,
  448,
  7606,
  34660,
  347,
  530

In [108]:
import pandas as pd
df = pd.DataFrame.from_dict(instance_max, orient="index")
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,226,227,228,229,230,231,232,233,234,235
input_ids,0,36,1205,4832,21277,1401,4,39367,4,175,...,11538,791,257,32675,4186,401,298,448,4839,2
attention_mask,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
labels,-100,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-100


In [109]:
df2 = pd.DataFrame.from_records(instance_max)
df2

Unnamed: 0,attention_mask,input_ids,labels
0,1,0,-100
1,1,36,0
2,1,1205,0
3,1,4832,0
4,1,21277,0
...,...,...,...
231,1,401,0
232,1,298,0
233,1,448,0
234,1,4839,0


In [110]:
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
pd.DataFrame(data=d, index=[0, 1, 2, 3])

Unnamed: 0,col1,col2
0,0,
1,1,
2,2,2.0
3,3,3.0


In [114]:
from datasets import Dataset
ds = Dataset.from_dict({"pokemon": ["bulbasaur", "squirtle"], "type": ["grass", "water"]})
ds

Dataset({
    features: ['pokemon', 'type'],
    num_rows: 2
})

In [None]:
Dataset.from_dict({})