```python
outputs = model(ids, mask, token_type_ids) # get outputs
# reshape outputs (make sure it is done correctly, e.g., by feeding 2 instances [torch.ones, torch.zeros]) ...
# ... first separately, and then in a combined way => reshaping format should be clear
loss = loss_fn(outputs, targets)           # get loss
loss.backward()                            # backpropagation
optimizer.step()                           # update parameters
optimizer.zero_grad()                      # clear gradients
```

https://stackoverflow.com/questions/62327803/having-6-labels-instead-of-2-in-hugging-face-bertforsequenceclassification<br>
https://huggingface.co/docs/transformers/main_classes/output<br>
https://www.kaggle.com/code/debarshichanda/bert-multi-label-text-classification/notebook<br>
https://discuss.huggingface.co/t/reshaping-logits-when-using-trainer/18214

In [1]:
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, DataCollatorWithPadding, AdamW
from datasets import load_dataset
checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
config = BertConfig.from_pretrained(checkpoint)
config.num_labels = 57
model = BertForSequenceClassification(config) 
print(model.parameters)
config

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33",
    "34": "LABE

In [2]:
# tokenize datasets
def tokenize_function(item):
    return tokenizer(item["text"], padding=True, truncation=True)
tokenized_datasets = (
    load_dataset("mdroth/transformers_issues_labels")
    .map(tokenize_function, batched=True)
    .remove_columns(column_names=["url", "text", "num_labels", "labels"])
    .rename_column("arr_labels", "labels")
)
print(tokenized_datasets)
# prepare batch from dev split
batch_size = 2
dev_samples = tokenized_datasets["dev"][:batch_size]
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dev_batch = data_collator(dev_samples)       # use data_collator to turn samples into a batch
print("\ndatacollator returns samples as uniform batch:\n{}".format({k: v.shape for k, v in dev_batch.items()}))

Using custom data configuration mdroth--transformers_issues_labels-e1a55ed64424aafd
Reusing dataset parquet (/Users/matthias/.cache/huggingface/datasets/parquet/mdroth--transformers_issues_labels-e1a55ed64424aafd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/4 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/matthias/.cache/huggingface/datasets/parquet/mdroth--transformers_issues_labels-e1a55ed64424aafd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-bc50ad711c8ff64c.arrow
Loading cached processed dataset at /Users/matthias/.cache/huggingface/datasets/parquet/mdroth--transformers_issues_labels-e1a55ed64424aafd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-f7197109205d9c03.arrow
Loading cached processed dataset at /Users/matthias/.cache/huggingface/datasets/parquet/mdroth--transformers_issues_labels-e1a55ed64424aafd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-7124570011edecb2.arrow
Loading cached processed dataset at /Users/matthias/.cache/huggingface/datasets/parquet/mdroth--transformers_issues_labels-e1a55ed64424aafd/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-f3a9b3f8613c3ab7.arrow


DatasetDict({
    valid: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 220
    })
    dev: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 277
    })
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 883
    })
})

datacollator returns samples as uniform batch:
{'labels': torch.Size([2, 57]), 'input_ids': torch.Size([2, 512]), 'token_type_ids': torch.Size([2, 512]), 'attention_mask': torch.Size([2, 512])}


In [3]:
#my_dict.pop('key', None)
labels_batch = dev_batch
labels_batch = labels_batch.pop("labels", None)
labels_batch

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [6]:
print(dev_batch.keys())
dev_batch

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


{'input_ids': tensor([[  101,   157, 12150,  ...,  1559,  2093,   102],
        [  101,   157, 12150,  ...,  9468,  1179,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [7]:
outputs = model(**dev_batch)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0400,  0.1878,  0.1274,  0.2317, -0.2159,  0.2350,  0.2424, -0.5218,
          0.0358, -0.2283,  0.0523, -0.2418, -0.3219, -0.1848,  0.0176,  0.1416,
         -0.3268, -0.3956,  0.1345, -0.1194,  0.4204, -0.3281,  0.0071,  0.0613,
         -0.3215,  0.5044, -0.1043, -0.1712,  0.3963,  0.0187,  0.3440, -0.0689,
          0.1384, -0.0714,  0.1173, -0.4938,  0.0983, -0.3728, -0.3036,  0.0592,
         -0.0559, -0.1301, -0.1116, -0.0315, -0.2888, -0.0652,  0.0112, -0.2715,
         -0.1207, -0.4523,  0.1587, -0.4962, -0.0997, -0.0793,  0.1053,  0.2931,
         -0.1170],
        [ 0.4054,  0.2027, -0.3325,  0.2702, -0.2085, -0.0591,  0.3589, -0.2228,
          0.1796, -0.1767, -0.1746, -0.1218, -0.2581,  0.1679, -0.0398,  0.0057,
         -0.1082, -0.4653,  0.3879, -0.3173,  0.4259, -0.0046,  0.0954,  0.0801,
         -0.5073,  0.3132,  0.3137,  0.0607,  0.2318, -0.0013, -0.0687, -0.1748,
          0.1861, -0.0196,  0.1730, -0.4786,  0

In [8]:
#from loss import BCEWithLogitsLoss
#BCEWithLogitsLoss
#import torch.nn.functional as F

In [9]:
# F.binary_cross_entropy_with_logits(outputs.logits, labels_batch)
#F.binary_cross_entropy_with_logits(outputs.logits, labels_batch.float())

In [10]:
#import torch
#input = torch.randn(3, requires_grad=True)
#print(f"input {input}")
#target = torch.empty(3).random_(2)
#print(f"target {target}")
#loss = F.binary_cross_entropy_with_logits(input, target)
#print(f"loss {loss}")
#loss.backward()

In [12]:
from torch.nn import BCEWithLogitsLoss
def loss_fn(logits, targets):
    return BCEWithLogitsLoss()(logits.float(), targets.float())

In [13]:
loss_fn(outputs.logits, labels_batch)

tensor(0.6797, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [14]:
optimizer = AdamW(params=model.parameters(), lr=2e-5, weight_decay=1e-6)



In [52]:
#import torch
model.train()
# training
outputs = model(**dev_batch)
#print(f"outputs:\n{outputs}")
out_logits = outputs.logits
#labels_batch.type_as(out_logits)
loss = loss_fn(out_logits, labels_batch)
#loss = loss_fn(outputs.logits, labels_batch)
print(loss)
loss.backward()
optimizer.step()
optimizer.zero_grad()
#
model.eval()

tensor(0.3767, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
dev_set = tokenized_datasets["dev"]
print(dev_set)
for i in range(dev_set.num_rows):
    print(f"\n{i}\n{dev_set[i]['labels']}")

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

In [None]:
for k, v in zip(data_collator):
    break
print(k, v)