In [2]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install transformers[torch]



In [3]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


## Fine-Tuning BERT for multi-class text classification

In [5]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")

Downloading builder script: 100%|██████████| 6.29k/6.29k [00:00<00:00, 20.9MB/s]
Downloading metadata: 100%|██████████| 7.76k/7.76k [00:00<00:00, 27.4MB/s]
Downloading readme: 100%|██████████| 10.6k/10.6k [00:00<00:00, 16.0MB/s]
Downloading data: 100%|██████████| 5.98M/5.98M [00:00<00:00, 35.1MB/s]
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  2.83it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00,  7.05it/s]
Generating train split: 100%|██████████| 6838/6838 [00:00<00:00, 9580.11 examples/s]
Generating test split: 100%|██████████| 3259/3259 [00:00<00:00, 9579.84 examples/s]
Generating validation split: 100%|██████████| 886/886 [00:00<00:00, 7849.77 examples/s]


In [6]:
len(dataset['train'])

6838

In [7]:
# prepare the labels dataset for inference
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

(…)cased/resolve/main/tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 113kB/s]
(…)rt-base-uncased/resolve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 2.57MB/s]
(…)bert-base-uncased/resolve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 10.9MB/s]
(…)base-uncased/resolve/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 41.0MB/s]


In [9]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map: 100%|██████████| 6838/6838 [00:00<00:00, 9585.90 examples/s] 
Map: 100%|██████████| 3259/3259 [00:00<00:00, 11458.13 examples/s]
Map: 100%|██████████| 886/886 [00:00<00:00, 11822.98 examples/s]


In [10]:
# define the model

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors: 100%|██████████| 440M/440M [00:01<00:00, 244MB/s]  
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not ini

In [11]:
batch_size = 8
metric_name = "f1"

In [12]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [13]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [14]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.4083,0.322057,0.66433,0.764223,0.264108
2,0.2839,0.309921,0.703094,0.797134,0.302483
3,0.2404,0.308732,0.70629,0.80021,0.281038
4,0.2163,0.30992,0.711462,0.803713,0.288939
5,0.1915,0.314015,0.709709,0.803054,0.28781


TrainOutput(global_step=4275, training_loss=0.2606135416309736, metrics={'train_runtime': 375.8152, 'train_samples_per_second': 90.976, 'train_steps_per_second': 11.375, 'total_flos': 2249123476753920.0, 'train_loss': 0.2606135416309736, 'epoch': 5.0})

In [16]:
trainer.evaluate()

{'eval_loss': 0.30991965532302856,
 'eval_f1': 0.7114624505928855,
 'eval_roc_auc': 0.8037130323402488,
 'eval_accuracy': 0.28893905191873587,
 'eval_runtime': 2.4577,
 'eval_samples_per_second': 360.505,
 'eval_steps_per_second': 45.165,
 'epoch': 5.0}

In [23]:
# save the fine-tuned model
trainer.save_model("bert-finetuned-sem_eval-english")

## Knowledge Distillation

In [24]:
import torch.nn.functional as F

class KnowledgeDistillationTrainingArguments(TrainingArguments):
  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    #*args allows us to pass a variable number of non-keyword arguments to a Python function.
    #**kwargs stands for keyword arguments. The only difference from args is that it uses keywords and returns the values in the form of a dictionary.
    super().__init__(*args, **kwargs)
    #The super() function is often used with the __init__() method to initialize the attributes of the parent class.
    self.alpha = alpha
    self.temperature = temperature

class KnowledgeDistillationTrainer(Trainer):
  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model

  def compute_loss(self, model, inputs, return_outputs=False):
    #Extract cross-entropy loss and logits from student
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # Extract logits from teacher
    outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

     #Computing distillation loss by Softening probabilities
    loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
    #The reduction=batchmean argument in nn.KLDivLoss() specifies that we average the losses over the batch dimension.
    loss_kd = self.args.temperature ** 2 * loss_fct(
                F.log_softmax(logits_student / self.args.temperature, dim=-1),
                F.softmax(logits_teacher / self.args.temperature, dim=-1))

    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss

In [25]:
student_model = "bert-base-uncased"

student_tokenizer = AutoTokenizer.from_pretrained(student_model)

def preprocess_data_student(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = student_tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

# preprocess data
encoded_dataset = dataset.map(preprocess_data_student, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

Map: 100%|██████████| 3259/3259 [00:00<00:00, 12034.87 examples/s]


In [26]:
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir = "./",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
    )

In [27]:
student_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

student_model = student_model.to(device)

# teacher_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
#                                                            problem_type="multi_label_classification",
#                                                            num_labels=len(labels))

# Use the fine-tuned model as the teacher
teacher_model = AutoModelForSequenceClassification.from_pretrained("bert-finetuned-sem_eval-english",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

teacher_model = teacher_model.to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [28]:
distilled_model_training = KnowledgeDistillationTrainer(student_model, teacher_model= teacher_model,
                                                        args = student_training_args, train_dataset=encoded_dataset["train"],
                                                        eval_dataset=encoded_dataset["validation"],compute_metrics=compute_metrics, tokenizer = student_tokenizer)

In [29]:
distilled_model_training.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,1.1178,0.336634,0.697838,0.80338,0.247178
2,0.4031,0.261402,0.702599,0.80254,0.267494
3,0.2532,0.20751,0.712202,0.806795,0.278781
4,0.2033,0.201182,0.708992,0.802227,0.274266
5,0.1623,0.194787,0.710957,0.802852,0.276524


TrainOutput(global_step=4275, training_loss=0.38770726343344525, metrics={'train_runtime': 644.5748, 'train_samples_per_second': 53.043, 'train_steps_per_second': 6.632, 'total_flos': 2249123476753920.0, 'train_loss': 0.38770726343344525, 'epoch': 5.0})

In [None]:
# save the distilled model
distilled_model_training.save_model("bert-distilled-sem_eval-english")