## Knowledge Distillation (Training Teacher and Student Model)

A classic example of model compression can be seen in various BERT models that employ knowledge distillation to compress their large deep models into lightweight versions of BERT. 

Knowledge_Distillation_Training

In this project, DistilBERT is a natural candidate for initializing the student because it has 40% fewer parameters than BERT and has been shown to achieve strong results on downstream tasks. The student should be a smaller model than the teacher in order to reduce latency and memory footprint. Knowledge distillation works best when the teacher and the student are of the same model type: different model types (for example BERT and RoBERTa) can have different output embedding spaces, which makes it harder for the student to mimic the teacher.

### Environment Setup

In [1]:
#!pip install transformers
#!pip install datasets
#!pip install transformers[torch]
#!pip install "accelerate>=0.20.1"

In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer,TrainingArguments, pipeline, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from datasets import load_dataset, load_metric
import time

### Prepare Dataset

In [17]:
#The CLINC150 dataset consists of a query in the text column and its corresponding intent
# Load the "plus" configuration of the dataset.
clinc = load_dataset("clinc_oos", "plus")
# Inspect the first training example; note that the whole example
# (text and intent id) is printed under the "question:" label.
sample = clinc["train"][0]
print("question:",sample)
# The "intent" feature converts between integer ids and intent names.
intents = clinc["train"].features["intent"]
intent = intents.int2str(sample["intent"])
print("intent  :",intent)

# Number of intent classes; reused below when the teacher and student models are built.
num_labels = intents.num_classes
print("labels  :",num_labels)

question: {'text': 'what expression would i use to say i love you if i were an italian', 'intent': 61}
intent  : translate
labels  : 151


#### Tokenizer

In [7]:
# Checkpoint the student starts from; its tokenizer is used to encode the dataset.
student_checkpoint = "distilbert-base-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_checkpoint)

In [8]:
def tokenize_text(batch):
  """Encode the "text" entries of a batch with the student tokenizer.

  Args:
    batch: Mapping with a "text" entry holding the raw queries.

  Returns:
    The tokenizer output for those queries, with truncation enabled.
  """
  queries = batch["text"]
  encoded = student_tokenizer(queries, truncation=True)
  return encoded

In [9]:
# Tokenize every split in batches and drop the raw "text" column afterwards.
clinc_tokenized = clinc.map(tokenize_text, batched=True, remove_columns=["text"])
# Rename "intent" to "labels" -- presumably so the Trainer passes it to the
# model as the `labels` argument; confirm against the model's forward signature.
clinc_tokenized = clinc_tokenized.rename_column("intent", "labels")

Map:   0%|          | 0/3100 [00:00<?, ? examples/s]

### Prepare training
Create trainer class and loss function compute_loss

In [10]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  """TrainingArguments that also carry the two distillation hyperparameters.

  Args:
    alpha: Stored as `self.alpha` (default 0.5).
    temperature: Stored as `self.temperature` (default 2.0).
    *args, **kwargs: Forwarded unchanged to `TrainingArguments`.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # Let the base class consume all of its own arguments first.
    super().__init__(*args, **kwargs)
    # Then attach the distillation-specific settings.
    self.alpha, self.temperature = alpha, temperature

In [12]:
class KnowledgeDistillationTrainer(Trainer):
  """Trainer whose loss mixes the student's own loss with a distillation term.

  The loss returned by `compute_loss` is

      alpha * loss_ce + (1 - alpha) * T**2 * KL(teacher_T || student_T)

  where `loss_ce` is the loss reported by the student model, `T` is
  `self.args.temperature`, `alpha` is `self.args.alpha`, and the KL term
  compares the temperature-softened teacher and student distributions.

  Args:
    teacher_model: Model whose logits the student should imitate. It is only
      ever used for inference here.
    *args, **kwargs: Forwarded unchanged to `Trainer`.
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher is a fixed reference: keep dropout & co. switched off so
      # its soft targets are deterministic.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the combined cross-entropy / distillation loss for one batch.

    Args:
      model: The student model being trained.
      inputs: Keyword arguments for the forward pass; the same inputs are
        fed to both student and teacher.
      return_outputs: When true, return `(loss, student_outputs)`.
      num_items_in_batch: Accepted (and ignored) so that Trainer versions
        which pass this keyword can call the override.

    Returns:
      The scalar loss, or `(loss, student_outputs)` if `return_outputs`.
    """
    # Student forward pass: the model's own loss plus raw logits.
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # Teacher forward pass without autograd: the teacher is not being
    # trained, so building a graph through it only wastes memory/compute
    # and would accumulate gradients in its parameters.
    with torch.no_grad():
      outputs_teacher = self.teacher_model(**inputs)
    logits_teacher = outputs_teacher.logits

    # Soften both distributions with the temperature. kl_div expects
    # log-probabilities for the input and probabilities for the target.
    temperature = self.args.temperature
    loss_kd = temperature ** 2 * F.kl_div(
        F.log_softmax(logits_student / temperature, dim=-1),
        F.softmax(logits_teacher / temperature, dim=-1),
        reduction="batchmean")

    alpha = self.args.alpha
    loss = alpha * loss_ce + (1. - alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss


In [13]:
# Accuracy metric object shared by every call to compute_metrics.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package -- confirm against the pinned version.
accuracy_score = load_metric("accuracy")

def compute_metrics(pred):
  """Score arg-max class predictions against the reference labels.

  Args:
    pred: A `(predictions, labels)` pair; `predictions` holds per-class
      scores with the classes on axis 1.

  Returns:
    The result of `accuracy_score.compute` for the predicted class ids.
  """
  predictions, labels = pred
  # Turn per-class scores into class ids by taking the highest-scoring class.
  predictions = np.argmax(predictions, axis=1)
  # `accuracy_score` must stay a module-level name: assigning to it anywhere
  # in this function would make it local and this call would raise
  # UnboundLocalError.
  return accuracy_score.compute(predictions=predictions, references=labels)


In this function, the predictions from the sequence modeling head come in the form of logits, so we use the np.argmax() function to find the most confident class prediction and compare it against the ground-truth label.

In [14]:
# Batch size shared by training and evaluation.
batch_size = 48
# Also used as the Trainer output directory below.
finetuned_student_ckpt = "distilbert-base-uncased-finetuned-clinc-student"

## Training Arguments for DistillationTrainer
# `temperature` is not passed, so the class default (2.0) applies.
# NOTE(review): with alpha=1 the loss in KnowledgeDistillationTrainer.compute_loss,
# alpha * loss_ce + (1 - alpha) * loss_kd, gives the distillation term a weight
# of 0 -- the student is trained on its own loss only. Choose alpha < 1 to
# actually distil from the teacher (the recorded results below were produced
# with alpha=1).
student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt, 
    evaluation_strategy = "epoch",
    num_train_epochs=3, 
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, 
    alpha=1, 
    weight_decay=0.01)

### Teacher model

In [18]:
# Load the already fine-tuned teacher checkpoint and place it on `device`.
teacher_checkpoint = "transformersbook/bert-base-uncased-finetuned-clinc"
teacher_model = (AutoModelForSequenceClassification.from_pretrained(teacher_checkpoint, 
                                                                    num_labels=num_labels).to(device))

### Student model 

In [19]:
# Same checkpoint string as `teacher_checkpoint`; a pipeline is built from it
# here only so that its label maps can be read.
# NOTE(review): this loads the teacher checkpoint a second time;
# `teacher_model.config` presumably exposes the same maps -- confirm before
# simplifying.
bert_ckpt = "transformersbook/bert-base-uncased-finetuned-clinc"
pipe = pipeline("text-classification", model=bert_ckpt)

## mappings between each intent and label ID.
id2label = pipe.model.config.id2label
label2id = pipe.model.config.label2id

# Student configuration: the student checkpoint's config with the class count
# and the label maps taken from the fine-tuned BERT checkpoint.
student_config = (AutoConfig.from_pretrained(student_checkpoint, 
                                             num_labels=num_labels,
                                             id2label=id2label, 
                                             label2id=label2id))

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def student_init():
  """Build a student model from the student checkpoint and move it to `device`.

  Returns:
    A sequence-classification model loaded from `student_checkpoint` with
    `student_config`, placed on `device`.
  """
  student = AutoModelForSequenceClassification.from_pretrained(
      student_checkpoint, config=student_config)
  return student.to(device)

### Run Training

In [20]:
%%time
# Wire everything together: the student is created through `student_init`,
# the teacher supplies the logits for the distillation term, and accuracy is
# computed on the validation split.
distilbert_trainer = KnowledgeDistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model, 
    args=student_training_args,
    train_dataset=clinc_tokenized['train'], 
    eval_dataset=clinc_tokenized['validation'],
    compute_metrics=compute_metrics, 
    tokenizer=student_tokenizer)

# Run training; as the last expression of the cell, its return value is displayed.
distilbert_trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[codecarbon INFO @ 05:12:03] [setup] RAM Tracking...
[codecarbon INFO @ 05:12:03] [setup] GPU Tracking...
[codecarbon INFO @ 05:12:03] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 05:12:03] [setup] CPU Tracking...
[codecarbon INFO @ 05:12:04] CPU Model on constant consumption mode: AMD Ryzen 7 5700G with Radeon Graphics
[codecarbon INFO @ 05:12:04] >>> Tracker's metadata:
[codecarbon INFO @ 05:12:04]   Platform system: Linux-6.4.6-76060406-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 05:12:04]   Python version: 3.10.12
[codecarbon INFO @ 05:12:04]   CodeCarbon version: 2.2.3
[codecarbon INFO @ 05:12:04]   Available RAM : 93.640 GB

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.420913,0.707742
2,3.894200,2.334888,0.816452
3,3.894200,2.009495,0.834194


[codecarbon INFO @ 05:12:28] Energy consumed for RAM : 0.000146 kWh. RAM Power : 35.114836692810066 W
[codecarbon INFO @ 05:12:28] Energy consumed for all GPUs : 0.001390 kWh. Total GPU Power : 333.46700000000004 W
[codecarbon INFO @ 05:12:28] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 05:12:28] 0.001713 kWh of electricity used since the beginning.
[codecarbon INFO @ 05:12:43] Energy consumed for RAM : 0.000293 kWh. RAM Power : 35.114836692810066 W
[codecarbon INFO @ 05:12:43] Energy consumed for all GPUs : 0.002773 kWh. Total GPU Power : 332.04900000000004 W
[codecarbon INFO @ 05:12:43] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 05:12:43] 0.003420 kWh of electricity used since the beginning.
[codecarbon INFO @ 05:12:58] Energy consumed for RAM : 0.000439 kWh. RAM Power : 35.114836692810066 W
[codecarbon INFO @ 05:12:58] Energy consumed for all GPUs : 0.004155 kWh. Total GPU Power : 331.87500000

CPU times: user 1min 17s, sys: 1.81 s, total: 1min 19s
Wall time: 1min 25s


TrainOutput(global_step=954, training_loss=3.1499722078911163, metrics={'train_runtime': 79.4697, 'train_samples_per_second': 575.691, 'train_steps_per_second': 12.005, 'total_flos': 247836315084876.0, 'train_loss': 3.1499722078911163, 'epoch': 3.0})

#### save training result model

In [27]:
# Local directories the two models are written to; the comparison cells below
# load the models back from these paths.
teacher_model_id_or_path="./result/teacher_model"
student_model_id_or_path="./result/student_model"

# The teacher is saved directly from the model object, the student through its trainer.
teacher_model.save_pretrained(teacher_model_id_or_path)
distilbert_trainer.save_model(student_model_id_or_path)

#### saved training log
TrainOutput(global_step=954, training_loss=3.1499722078911163, metrics={'train_runtime': 79.4697, 'train_samples_per_second': 575.691, 'train_steps_per_second': 12.005, 'total_flos': 247836315084876.0, 'train_loss': 3.1499722078911163, 'epoch': 3.0})
```
Epoch Training Loss 	Validation Loss 	Accuracy
1 	   No log           3.420913 	        0.707742
2 	   3.894200 	    2.334888 	        0.816452
3 	   3.894200 	    2.009495 	        0.834194
```

### Verify Teacher and Student Model
Compare the two models based on size and inference time.

In [22]:
def compute_parameters(model_path):
  """Load a sequence-classification model and report its parameter count.

  Args:
    model_path: Model id or directory passed to `from_pretrained`.

  Returns:
    The value of the loaded model's `num_parameters()`.
  """
  return AutoModelForSequenceClassification.from_pretrained(model_path).num_parameters()

### saving in model size

In [36]:
# Parameter counts of the two saved models.
teacher_model_parameters = compute_parameters(model_path=teacher_model_id_or_path)
print("Teacher Model: ", teacher_model_parameters)

student_model_parameters = compute_parameters(model_path=student_model_id_or_path)
print("Student Model: ", student_model_parameters)

# Relative change of the student with respect to the teacher; the value is
# negative when the student has fewer parameters. Printed as a percentage.
decrease = (student_model_parameters-teacher_model_parameters)/teacher_model_parameters
print("difference in parameters:",decrease*100)

Teacher Model:  109598359
Student Model:  67069591
difference in parameters: -38.804201438818986


In [68]:
## file size reduction 
# Print a heading, then list the saved teacher directory with sizes in MB blocks.
!echo 'Teacher Model File Size'
!ls ./result/teacher_model -al --block-size=MB

Teacher Model File Size


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 439MB
drwxrwxr-x 2 pop pop   1MB Oct  9 05:17 .
drwxrwxr-x 4 pop pop   1MB Oct  9 05:17 ..
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 config.json
-rw-rw-r-- 1 pop pop 439MB Oct  9 05:18 pytorch_model.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [69]:
# Print a heading, then list the saved student directory with sizes in MB blocks.
!echo 'Student Model File Size'
!ls ./result/student_model -al --block-size=MB

Student Model File Size


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 270MB
drwxrwxr-x 2 pop pop   1MB Oct  9 05:17 .
drwxrwxr-x 4 pop pop   1MB Oct  9 05:17 ..
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 added_tokens.json
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 config.json
-rw-rw-r-- 1 pop pop 269MB Oct  9 05:18 pytorch_model.bin
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 special_tokens_map.json
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 tokenizer_config.json
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 tokenizer.json
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 training_args.bin
-rw-rw-r-- 1 pop pop   1MB Oct  9 05:18 vocab.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [71]:
## spot check a sample
# Index the row first: `clinc['train']['text'][11]` asks for the entire column
# just to read a single element, while the row lookup returns the same value.
print(clinc['train'][11]['text'])
print(clinc['train'][11]['intent'])

i would like to know the proper way to greet an adult in portuguese
61


### Saving in inference performance

In [72]:
def performance_test(model_id_or_path,model_type,tokenizer_id,
                     sample_input=None,n_warmup=10,n_runs=100):
    """Time repeated single-query inference through a text-classification pipeline.

    Args:
        model_id_or_path: Model id or directory handed to `pipeline`.
        model_type: Label used in the printed summary line.
        tokenizer_id: Tokenizer id or directory handed to `pipeline`.
        sample_input: Query to classify; defaults to the text of training
            example 11 of the global `clinc` dataset.
        n_warmup: Untimed calls made before measuring (default 10).
        n_runs: Timed calls (default 100).

    Returns:
        Elapsed seconds for the `n_runs` timed calls.
    """
    print("performance_test: ",model_id_or_path)
    pipe = pipeline("text-classification", model=model_id_or_path, tokenizer=tokenizer_id)
    if sample_input is None:
        # Row-first lookup: avoids requesting the whole "text" column for one item.
        sample_input = clinc['train'][11]['text']
    # Warm-up calls so one-off start-up costs are not part of the measurement.
    for _ in range(n_warmup):
        pipe(sample_input)
    ## run test
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    for _ in range(n_runs):
        pipe(sample_input)
    total_time = time.perf_counter()-start
    print(f"Total time to process {n_runs} requests for {model_type}: ",total_time)
    return total_time

In [73]:
# teacher model test
# The tokenizer is named explicitly for each model; the teacher directory
# listing above contains no tokenizer files.
teacher_total_time = performance_test(teacher_model_id_or_path,model_type="Teacher Model",tokenizer_id='bert-base-uncased')

# student model test
student_total_time = performance_test(student_model_id_or_path,model_type="Student Model",  tokenizer_id="distilbert-base-uncased")

# compute saving
# Fraction of the teacher's total time that the student saves, printed as a percentage.
changes_in_time = (teacher_total_time-student_total_time)/teacher_total_time
print("saving in inference time:",changes_in_time*100, "%")

performance_test:  ./result/teacher_model
Total time to process 100 requests for Teacher Model:  3.7654707431793213
performance_test:  ./result/student_model
Total time to process 100 requests for Student Model:  1.9501032829284668
saving in inference time: 48.2109033389561 %
