## Read and tokenize data

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("glue", "mrpc")
raw_datasets = load_dataset('csv', data_files={'train': ['train.csv'], 'eval': 'test.csv'})


# checkpoint = "bert-base-uncased"
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Using custom data configuration default-d8f4c168f3d8923e
Reusing dataset csv (/users/phd/kpawan/.cache/huggingface/datasets/csv/default-d8f4c168f3d8923e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /users/phd/kpawan/.cache/huggingface/datasets/csv/default-d8f4c168f3d8923e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-d1f6995c61059c99.arrow
Loading cached processed dataset at /users/phd/kpawan/.cache/huggingface/datasets/csv/default-d8f4c168f3d8923e/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-b325f37cdfa3cd0f.arrow


In [2]:
tokenized_datasets["train"].column_names

['attention_mask', 'input_ids', 'label', 'text']

In [3]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names


['attention_mask', 'input_ids', 'labels']

In [4]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=32, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["eval"], batch_size=8, collate_fn=data_collator
)


## Test batches

In [5]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'attention_mask': torch.Size([32, 47]),
 'input_ids': torch.Size([32, 47]),
 'labels': torch.Size([32])}

In [6]:
import torch
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
from tqdm import tqdm

## compute_metric function for evaluation

In [7]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_preds):
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)



## Inherit Trainer and Override compute_loss to change the loss function

In [2]:
from transformers import Trainer, TrainingArguments
from torch import nn

class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        cuda0 = torch.device('cuda:0')
        pos_weight = torch.ones([self.model.config.num_labels], device=cuda0)
#         pos_weight[16] = 2.0
#         loss_fct = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss_fct = nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        torch.nn.functional.one_hot(labels, num_classes=self.model.config.num_labels).float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [9]:
import os
os.environ["WANDB_DISABLED"] = "true"

## Train the model

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=19)

training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
)


from transformers import Trainer

# trainer = MultilabelTrainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["eval"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


trainer.train()


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.94705,0.744203
2,No log,0.61703,0.826647
3,No log,0.544373,0.842841
4,No log,0.558868,0.845418
5,No log,0.559438,0.844682


***** Running Evaluation *****
  Num examples = 2717
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2717
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2717
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2717
  Batch size = 128
***** Running Evaluation *****
  Num examples = 2717
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=315, training_loss=0.7400496709914435, metrics={'train_runtime': 455.9329, 'train_samples_per_second': 87.732, 'train_steps_per_second': 0.691, 'total_flos': 1623083804017536.0, 'train_loss': 0.7400496709914435, 'epoch': 5.0})

## Get model predictions

In [11]:
predictions = trainer.predict(tokenized_datasets["eval"])
print(predictions.predictions.shape, predictions.label_ids.shape)


***** Running Prediction *****
  Num examples = 2717
  Batch size = 128


(2717, 19) (2717,)


## Compute different metrics

In [12]:
preds = np.argmax(predictions.predictions, axis=-1)
metric = load_metric("accuracy","")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.8446816341553184}

In [13]:
from sklearn.metrics import f1_score, confusion_matrix
print(f1_score(predictions.label_ids, preds, average='micro'))
confusion_matrix(predictions.label_ids, preds, )

0.8446816341553184


array([[ 46,   0,   0,   0,   1,   0,   0,   0,   0,   1,   0,   1,   0,
          0,   0,   1,   1,   0,   0],
       [  0,  42,   0,   0,   0,   0,   0,   0,   0,   1,   0,   0,   0,
          0,   0,   1,   2,   1,   0],
       [  0,   0, 112,   1,   3,   0,   0,   0,   0,   0,   0,   0,   0,
          1,   0,   1,  16,   0,   0],
       [  0,   0,   0, 181,   0,   3,   0,   0,   0,   1,   0,   0,   0,
          0,   0,   0,   9,   0,   0],
       [  0,   0,   5,   0, 126,   0,   3,   0,   0,   0,   0,   1,   0,
          4,   4,   1,   6,   0,   0],
       [  0,   0,   0,   4,   0,  91,   0,   2,   0,   5,   0,   0,   0,
          0,   0,   2,   4,   0,   0],
       [  0,   1,   0,   0,   1,   0,  36,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   1,   0,   0],
       [  0,   0,   0,   0,   0,   2,   0,  15,   0,   0,   0,   0,   0,
          0,   0,   0,   5,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   1,   0