In [None]:
!pip install transformers

In [None]:
from transformers import TrainingArguments

In [None]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  """TrainingArguments extended with two knowledge-distillation hyperparameters.

  alpha: weight given to the student's cross-entropy loss; the distillation
    loss receives (1 - alpha) in KnowledgeDistillationTrainer.compute_loss.
  temperature: value the logits are divided by before the softmax; it controls
    how much the probability distribution over the labels is smoothed.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # All standard arguments are forwarded untouched to the parent class.
    super().__init__(*args, **kwargs)
    self.alpha, self.temperature = alpha, temperature


# **Coding the Loss Function**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

In [None]:
class KnowledgeDistillationTrainer(Trainer):
  """Trainer whose loss combines the student's cross-entropy with a distillation term.

  The distillation term is the KL divergence between the temperature-softened
  student and teacher distributions. `self.args` must provide `alpha` and
  `temperature` (see KnowledgeDistillationTrainingArguments).
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    """Create the trainer; `teacher_model` supplies the soft targets."""
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher only produces targets, so make sure layers such as dropout
      # run in inference mode regardless of how the model was constructed.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
    """Return alpha * CE + (1 - alpha) * T^2 * KL(student || teacher).

    Args:
      model: the student model being trained.
      inputs: batch dict forwarded to both the student and the teacher.
      return_outputs: when True, also return the student's outputs.
      **kwargs: extra arguments passed by the Trainer; accepted and ignored.

    Returns:
      The weighted loss, or a (loss, student_outputs) tuple when
      `return_outputs` is True.
    """
    # Extract cross-entropy loss and logits from the student
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # The teacher is never optimized: run it without autograd so no graph or
    # gradients are kept for its parameters.
    with torch.no_grad():
      logits_teacher = self.teacher_model(**inputs).logits

    # Distillation loss on the softened probabilities. reduction="batchmean"
    # averages the KL divergence over the batch dimension; the T^2 factor
    # rescales the soft-target term.
    temperature = self.args.temperature
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    loss_kd = temperature ** 2 * loss_fct(
        F.log_softmax(logits_student / temperature, dim=-1),
        F.softmax(logits_teacher / temperature, dim=-1))

    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss


# **Loading the Datasets**

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

In [None]:
# Load the CLINC150 dataset.

# The "plus" configuration refers to the subset that contains the out-of-scope training examples.
clinc = load_dataset("clinc_oos", "plus")


In [None]:
# Look at the first training example.
# Each example in the CLINC150 dataset consists of a query in the text column and its corresponding intent.
sample = clinc["train"][0]
print(sample)

In [None]:
# The "intent" feature of the training split; used here to turn a label ID into a name.
intents = clinc["train"].features["intent"]
# Convert the sample's integer intent into its string form.
intent = intents.int2str(sample["intent"])
print(intent)

# **Tokenizing the dataset**

In [None]:
from transformers import AutoTokenizer

In [None]:
# Base checkpoint for the student model, and the tokenizer that goes with it.
student_checkpoint = "distilbert-base-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

In [None]:
def tokenize_text(batch):
  """Tokenize the "text" entries of `batch` with the student tokenizer, with truncation enabled."""
  texts = batch["text"]
  return student_tokenizer(texts, truncation=True)


In [None]:
# Tokenize every split and drop the "text" column, which is no longer needed.
# The "intent" column is renamed to "labels" so it can be automatically
# detected by the trainer.
clinc_tokenized = (
    clinc
    .map(tokenize_text, batched=True, remove_columns=["text"])
    .rename_column("intent", "labels")
)


# **Defining metrics for Distillation Trainer**

In [None]:
!pip install evaluate

In [None]:
import numpy as np
import evaluate

# Accuracy metric used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    """Compute accuracy for a (predictions, labels) pair.

    The predicted class is the argmax of the predictions along axis 1.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [None]:
!pip install transformers[torch]

In [None]:
# Per-device batch size used for both training and evaluation of the student.
batch_size = 48
# Output directory passed to the student's training arguments.
finetuned_student_ckpt = "distilbert-base-uncased-finetuned-clinc-student"

In [None]:
# Quote the requirement: unquoted, the shell treats ">" as an output redirect,
# so pip would install an unconstrained accelerate and write to a file "=0.20.1".
!pip install "accelerate>=0.20.1"

In [None]:
# NOTE: alpha=1 puts the whole weight on the student's cross-entropy loss, so the
# distillation term in KnowledgeDistillationTrainer.compute_loss is multiplied
# by zero for this run.
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt,
    evaluation_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    alpha=1,
    weight_decay=0.01,
)

# **Initializing Student Model & providing the student model with the mappings between each intent and label ID**

In [None]:
from transformers import pipeline

# Fine-tuned BERT checkpoint, loaded as a text-classification pipeline.
bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)

# Read the ID <-> label mappings from that model's config so they can be
# passed to the student's config.
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

In [None]:
from transformers import AutoConfig
# One output per intent class; the label mappings come from the fine-tuned BERT config.
num_labels = intents.num_classes
student_config = AutoConfig.from_pretrained(
    student_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
import torch
from transformers import AutoModelForSequenceClassification
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def student_init():
  """Load the student classifier from `student_checkpoint` with `student_config` and move it to `device`."""
  student = AutoModelForSequenceClassification.from_pretrained(
      student_checkpoint, config=student_config)
  return student.to(device)

# **Loading teacher Checkpoint & Starting Fine-Tuning**

In [None]:
# Checkpoint of the teacher model (the same identifier as bert_ckpt).
teacher_checkpoint = "transformersbook/bert-base-uncased-finetuned-clinc"

In [None]:
# Load the teacher classifier and move it to the selected device.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_checkpoint, num_labels=num_labels
).to(device)


In [None]:
# Build the distillation trainer and start the training process.
distilbert_trainer = KnowledgeDistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=clinc_tokenized["train"],
    eval_dataset=clinc_tokenized["validation"],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)
distilbert_trainer.train()

In [None]:
def save_teacher_model():
  """Save the teacher model to the "teacher_model" directory (a relative path)."""
  teacher_model.save_pretrained("teacher_model")
def save_student_model():
  """Save the trainer's model to the 'student_model' directory (a relative path)."""
  distilbert_trainer.save_model('student_model')


In [None]:
# Write both models to disk; the comparison cells below load them from these directories.
save_teacher_model()
save_student_model()

# **Comparing the Two Models by Number of Parameters**

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification
import os

def compute_parameters(model_path):
  """Load the sequence-classification model at `model_path` and return its `num_parameters()`."""
  return AutoModelForSequenceClassification.from_pretrained(model_path).num_parameters()


In [None]:
# Load from the same relative directory that save_teacher_model() writes to,
# instead of a hard-coded /content path that only matches on Colab.
teacher_model_parameters = compute_parameters(model_path="teacher_model")
print("Teacher Model: ", teacher_model_parameters)


In [None]:
# Load from the same relative directory that save_student_model() writes to,
# instead of a hard-coded /content path that only matches on Colab.
student_model_parameters = compute_parameters(model_path="student_model")
print("Student Model: ", student_model_parameters)

In [None]:
# List the saved student files (sizes in MB) from the relative save directory.
!ls -al --block-size=MB student_model

In [None]:
# List the saved teacher files (sizes in MB) from the relative save directory.
!ls -al --block-size=MB teacher_model

In [None]:
# Relative reduction in parameter count, as a percentage of the teacher's size.
# Teacher minus student keeps a reduction positive, matching how the
# inference-time decrease is computed later in the notebook.
decrease = (teacher_model_parameters - student_model_parameters) / teacher_model_parameters
print(decrease*100)

# **Comparing Accuracies of Teacher & Student Model**

In [None]:
import numpy as np
import torch
import evaluate
from torch.utils.data import DataLoader
from tqdm import tqdm

from transformers import DataCollatorWithPadding

# NOTE(review): this module-level collator does not appear to be referenced in the
# visible notebook; compute_model_accuracy builds its own from its tokenizer argument.
data_collator = DataCollatorWithPadding(tokenizer=student_tokenizer, return_tensors="pt")


# Accuracy metric used by compute_model_accuracy.
accuracy_metric = evaluate.load("accuracy")

def compute_model_accuracy(model, dataset, tokenizer, batch_size=32):
    """Run `model` over `dataset` in batches and return the accuracy metric result.

    Args:
        model: classifier whose outputs expose `.logits`; it is switched to
            eval mode and moved to CUDA when available, otherwise CPU.
        dataset: examples containing a "labels" entry plus the model inputs.
        tokenizer: tokenizer used by the padding collator to batch examples.
        batch_size: number of examples per batch.

    Returns:
        The result of `accuracy_metric.compute` over all predictions.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    model.eval()
    model.to(target_device)

    # Pad each batch dynamically and return PyTorch tensors.
    collate = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

    predicted, expected = [], []
    for batch in tqdm(loader, desc="Evaluating", leave=False):
        gold = batch["labels"].to(model.device)
        # Everything except the labels is fed to the model.
        features = {
            name: tensor.to(model.device)
            for name, tensor in batch.items()
            if name != "labels"
        }

        with torch.no_grad():
            batch_preds = torch.argmax(model(**features).logits, dim=-1)

        predicted.extend(batch_preds.cpu().numpy())
        expected.extend(gold.cpu().numpy())

    return accuracy_metric.compute(predictions=predicted, references=expected)


In [None]:
from transformers import AutoModelForSequenceClassification

# Reload the student from the relative directory written by save_student_model(),
# instead of a hard-coded /content path that only matches on Colab.
student_model = AutoModelForSequenceClassification.from_pretrained("student_model")


In [None]:
# Evaluate both models on the validation split; the student tokenizer is used
# to pad batches for both.
teacher_acc = compute_model_accuracy(teacher_model, clinc_tokenized["validation"], tokenizer=student_tokenizer)
student_acc = compute_model_accuracy(student_model, clinc_tokenized["validation"], tokenizer=student_tokenizer)

# Report accuracy as a percentage with two decimals.
print(f"Teacher Accuracy: {teacher_acc['accuracy']*100:.2f}%")
print(f"Student Accuracy: {student_acc['accuracy']*100:.2f}%")


# **Comparing Inference times of both Models**

In [None]:
#Lets warmup first
from transformers import pipeline
import time

# Load the teacher from the relative directory written by save_teacher_model(),
# instead of a hard-coded /content path that only matches on Colab.
pipe = pipeline("text-classification", model="teacher_model", tokenizer='bert-base-uncased')

# Index the row first so a single example is read rather than the whole text column.
sample_input = clinc['train'][101]['text']

#WARMUP
for _ in range(10):
  _ = pipe(sample_input)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  _ = pipe(sample_input)
total_time_teacher_model = time.perf_counter()-start
print("Total time to process 100 requests for Teacher Model: ",total_time_teacher_model)

In [None]:
# Load the student from the relative directory written by save_student_model(),
# instead of a hard-coded /content path that only matches on Colab.
pipe = pipeline("text-classification", model="student_model", tokenizer="distilbert-base-uncased")

# Index the row first so a single example is read rather than the whole text column.
sample_input = clinc['train'][101]['text']

#WARMUP
for _ in range(10):
  _ = pipe(sample_input)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for _ in range(100):
  _ = pipe(sample_input)
total_time_student_model = time.perf_counter()-start

print("Total time to process 100 requests for Student Model: ",total_time_student_model)

In [None]:
# Relative reduction in total inference time, as a fraction of the teacher's time.
decrease_in_time = (total_time_teacher_model-total_time_student_model)/total_time_teacher_model
# Printed as a percentage.
print(decrease_in_time*100)
