In [None]:
%pip install accelerate>=0.20.1 transformers datasets evaluate

In [1]:
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class KDTrainingArgs(TrainingArguments):
  """``TrainingArguments`` extended with two knowledge-distillation settings.

  Args:
    *args: Forwarded unchanged to ``TrainingArguments``.
    alpha: Stored on the instance as ``self.alpha``.
    temperature: Stored on the instance as ``self.temperature``.
    **kwargs: Forwarded unchanged to ``TrainingArguments``.
  """

  def __init__(self, *args, alpha=1, temperature = 0.2, **kwargs):
    # Let the base class consume every standard training option first.
    super().__init__(*args, **kwargs)

    # NOTE(review): the default temperature is below 1; distillation setups
    # commonly use values >= 1 to soften the distributions -- confirm 0.2 is intended.
    self.alpha, self.temperature = alpha, temperature


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

In [4]:
class KDTrainer(Trainer):
  """``Trainer`` whose training loss mixes cross-entropy with a distillation term.

  The loss is ``alpha * CE + (1 - alpha) * T**2 * KL(student || teacher)``, where
  ``alpha`` and ``T`` (temperature) are read from ``self.args``.

  Args:
    *args: Forwarded unchanged to ``Trainer``.
    teacher_model: Model called with the same inputs as the student; its
      ``.logits`` provide the target distribution for the distillation term.
    **kwargs: Forwarded unchanged to ``Trainer``.
  """

  def __init__(self, *args, teacher_model = None, **kwargs):
    super().__init__(*args, **kwargs)

    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher is only used for inference: make sure dropout etc. are disabled.
      self.teacher_model.eval()


  def compute_loss(self, model, inputs, return_outputs = False, num_items_in_batch = None):
    """Return the combined cross-entropy / distillation loss for one batch.

    This must be named ``compute_loss``: that is the hook ``Trainer`` calls during
    training. (It was previously named ``loss`` and therefore never invoked.)

    Args:
      model: The student model being trained.
      inputs: Keyword arguments for the forward pass; passed to both student and teacher.
      return_outputs: If true, return ``(loss, student_outputs)`` instead of just the loss.
      num_items_in_batch: Accepted for compatibility with ``Trainer`` versions that
        pass it to ``compute_loss``; not used here.

    Returns:
      The scalar loss, or ``(loss, student_outputs)`` when ``return_outputs`` is true.
    """
    student_outputs = model(**inputs)
    ce_loss = student_outputs.loss
    student_logits = student_outputs.logits

    # The teacher is frozen: skip building an autograd graph for its forward pass.
    with torch.no_grad():
      teacher_logits = self.teacher_model(**inputs).logits

    temperature = self.args.temperature
    loss_fn = nn.KLDivLoss(reduction = 'batchmean')

    # KLDivLoss expects log-probabilities as input and probabilities as target.
    # The T**2 factor rescales the soft-target term after dividing logits by T.
    kd_loss = temperature**2*loss_fn(
        F.log_softmax(student_logits/temperature, dim = -1),
        F.softmax(teacher_logits/temperature, dim = -1)
    )

    loss = self.args.alpha*ce_loss + (1. -self.args.alpha)*kd_loss

    return (loss, student_outputs) if return_outputs else loss

  # Backward-compatible alias for the method's original name.
  loss = compute_loss

In [5]:
from datasets import load_dataset
# Load the 'plus' configuration of the clinc_oos dataset.
ds = load_dataset("clinc/clinc_oos", 'plus')
# Keep the first training example around; a later cell decodes its intent id.
temp = ds['train'][0]
print(temp)

{'text': 'what expression would i use to say i love you if i were an italian', 'intent': 61}


In [6]:
# Feature object describing the 'intent' column; used here to map an id to its
# name and later for its `num_classes`.
intents = ds['train'].features['intent']
# Decode the integer intent of the sample example into its string label.
intent = intents.int2str(temp['intent'])
print(intent)

translate


In [7]:
from transformers import AutoTokenizer

In [8]:
# Checkpoint the student model, its config and its tokenizer are all loaded from.
student_ckpt = 'distilbert-base-uncased'
student_tknzr = AutoTokenizer.from_pretrained(student_ckpt)

In [9]:
def tokenize(batch):
    """Run the student tokenizer over ``batch['text']`` with truncation enabled."""
    texts = batch['text']
    return student_tknzr(texts, truncation = True)

In [10]:
# Tokenize every split in batches, drop the raw 'text' column, and rename
# 'intent' to 'labels'.
ds_tokenized = (
    ds.map(tokenize, batched = True, remove_columns = 'text')
      .rename_column('intent', 'labels')
)

Map: 100%|██████████| 5500/5500 [00:00<00:00, 27381.99 examples/s]


In [14]:
import numpy as np
import evaluate

# Accuracy metric shared by every evaluation call.
acc = evaluate.load('accuracy')


def comp_metric(preds):
    """Compute accuracy from a ``(scores, labels)`` pair.

    The predicted class for each row is the argmax of the scores along axis 1.
    """
    scores, labels = preds
    predicted_ids = np.argmax(scores, axis = 1)
    return acc.compute(predictions = predicted_ids, references = labels)

In [15]:
# Per-device batch size, used for both training and evaluation below.
batchsize = 48
# Output directory for the student's training run.
finetuned_student_ckpt = "distilbert-base-uncased-finetuned-clinc-student"

In [16]:
# Training configuration for the student run.
# NOTE: alpha = 1 puts all the weight on the student's cross-entropy term in
# KDTrainer's loss; the distillation term is multiplied by (1 - alpha) = 0.
sttrain_args = KDTrainingArgs(
    output_dir = finetuned_student_ckpt,
    eval_strategy = 'epoch',
    num_train_epochs = 10,
    learning_rate = 2e-5,
    per_device_train_batch_size = batchsize,
    per_device_eval_batch_size = batchsize,
    alpha = 1,
    weight_decay = 0.01,
)

In [17]:
from transformers import pipeline

# Fine-tuned BERT checkpoint (the same id is later used for the teacher model).
bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)

# Reuse the fine-tuned checkpoint's label mappings for the student's config.
# In the cells shown, `pipe` is only used to read these two mappings.
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cuda:0


In [18]:
from transformers import AutoConfig

# Number of intent classes, taken from the dataset's 'intent' feature.
num_labels = intents.num_classes

# Student config: base checkpoint settings plus the classification head size
# and the label mappings read from the fine-tuned BERT checkpoint.
student_config = AutoConfig.from_pretrained(
    student_ckpt,
    num_labels = num_labels,
    id2label = id2label,
    label2id = label2id,
)

In [19]:
import torch
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def student_init():
  """Build a fresh student model from ``student_ckpt`` and move it to ``device``."""
  student = AutoModelForSequenceClassification.from_pretrained(student_ckpt, config=student_config)
  return student.to(device)
     

In [20]:
# Checkpoint of the teacher model (same id as `bert_ckpt` above).
teacher_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"

In [21]:
# Load the teacher and place it on the same device as the student.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_ckpt, num_labels=num_labels
)
teacher_model = teacher_model.to(device)

In [22]:
# Assemble the trainer for the student run and launch training.
# NOTE(review): recent transformers releases may deprecate `tokenizer=` in favor
# of `processing_class=` -- confirm against the installed version.
distilbert_trainer = KDTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=sttrain_args,
    train_dataset=ds_tokenized['train'],
    eval_dataset=ds_tokenized['validation'],
    compute_metrics=comp_metric,
    tokenizer=student_tknzr,
)
distilbert_trainer.train()

  super().__init__(*args, **kwargs)
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.218292,0.738387
2,3.740400,1.60641,0.865161
3,3.740400,0.802119,0.912581
4,1.373400,0.475949,0.930645
5,0.441200,0.344988,0.934516
6,0.441200,0.300421,0.940645
7,0.177700,0.266686,0.942581
8,0.091000,0.257944,0.943226
9,0.091000,0.255009,0.943548
10,0.060800,0.255079,0.944194


TrainOutput(global_step=3180, training_loss=0.9283622285854891, metrics={'train_runtime': 304.4594, 'train_samples_per_second': 500.888, 'train_steps_per_second': 10.445, 'total_flos': 827728372450224.0, 'train_loss': 0.9283622285854891, 'epoch': 10.0})

In [23]:
def save_teacher_model():
  """Write the teacher model to the ``teacher_model`` directory."""
  target_dir = "teacher_model"
  teacher_model.save_pretrained(target_dir)
def save_student_model():
  """Save the trainer's student model to the ``student_model`` directory."""
  target_dir = 'student_model'
  distilbert_trainer.save_model(target_dir)

In [None]:
# Persist both models to disk; the parameter-count cells below reload them
# from the "teacher_model" and "student_model" directories.
save_teacher_model()
save_student_model()

In [25]:

from transformers import AutoConfig, AutoModelForSequenceClassification
import os

def compute_parameters(model_path):
  """Load the sequence-classification model at ``model_path`` and return its ``num_parameters()``."""
  loaded = AutoModelForSequenceClassification.from_pretrained(model_path)
  return loaded.num_parameters()

In [29]:
# Parameter count of the saved teacher model.
tea_param = compute_parameters("teacher_model")
tea_param

109598359

In [30]:
# Parameter count of the saved student model.
stu_param = compute_parameters("student_model")
stu_param

67069591

## Percentage Reduction in Number of Parameters

In [31]:
# Relative drop in parameter count from teacher to student, as a percentage.
(tea_param-stu_param)/tea_param * 100

38.804201438818986