##First we install Dependencies

In [13]:
!pip install -q transformers datasets


In [23]:
dataset_ckpt = 'ag_news'
teacher_model_ckpt = 'odunola/bert-base-cased-ag-news' #our already finetuned teacher model
student_model_ckpt = 'google/bert_uncased_L-6_H-256_A-4'

####Importing dependencies

In [24]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForSequenceClassification
from torch import nn
from torch import optim
from torch.nn import functional as F
from transformers import AutoTokenizer

In [25]:
student_tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)

Downloading (…)lve/main/config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [26]:
test_list = ['W have a hope for the unknown'] * 5


In [27]:
student_tokens = student_tokenizer(test_list, max_length = 50, truncation = True, padding = True)
teacher_tokens = teacher_tokenizer(test_list, max_length = 50, truncation = True, padding = True)

In [28]:
student_tokens

{'input_ids': [[101, 1059, 2031, 1037, 3246, 2005, 1996, 4242, 102], [101, 1059, 2031, 1037, 3246, 2005, 1996, 4242, 102], [101, 1059, 2031, 1037, 3246, 2005, 1996, 4242, 102], [101, 1059, 2031, 1037, 3246, 2005, 1996, 4242, 102], [101, 1059, 2031, 1037, 3246, 2005, 1996, 4242, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [29]:
teacher_tokens

{'input_ids': [[101, 160, 1138, 170, 2810, 1111, 1103, 3655, 102], [101, 160, 1138, 170, 2810, 1111, 1103, 3655, 102], [101, 160, 1138, 170, 2810, 1111, 1103, 3655, 102], [101, 160, 1138, 170, 2810, 1111, 1103, 3655, 102], [101, 160, 1138, 170, 2810, 1111, 1103, 3655, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

###Preprocessing Data
We would be using the ag_news dataset comprised of 120k training samples and 7600 test samples. for our validation set we extract 12k samples from the original train set



In [None]:
data = load_dataset(dataset_ckpt)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [30]:
data['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})

In [31]:
train_valid_data = data['train']
test_data = data['test']
train_valid_data = train_valid_data.shuffle().train_test_split(test_size = 0.1)
train_data, valid_data = train_valid_data['train'], train_valid_data['test']

In [34]:
def get_num_rows(dataset):
  return dataset.num_rows

print(f'Train set has {get_num_rows(train_data)} texts')
print(f'Valid set has {get_num_rows(valid_data)} texts')
print(f'Test set has {get_num_rows(test_data)} texts')

Train set has 108000 texts
Valid set has 12000 texts
Test set has 7600 texts


####Now we pull our tokenizer from the huggingface hub. since both are BERT Models we should be able to use the same tokenizzer for both student and teacher

In [35]:
student_tokenizer = AutoTokenizer.from_pretrained(student_model_ckpt)
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)

In [36]:
#now we would utilise pytorch's Dataset andDataloader classes to create our dataset

class MyData(Dataset):
  def __init__(self, data):
    targets = data['label']
    texts = data['text']

    student_tokens = student_tokenizer(texts, return_tensors = 'pt', truncation = True, padding = 'max_length', max_length = 150)
    teacher_tokens = teacher_tokenizer(texts, return_tensors = 'pt', truncation = True, padding = 'max_length', max_length = 150)
    self.input_ids_teacher = teacher_tokens['input_ids']
    self.attention_mask_teacher = teacher_tokens['attention_mask']
    self.input_ids_student = student_tokens['input_ids']
    self.attention_mask_student = student_tokens['attention_mask']
    self.targets = torch.tensor(targets)
    self.length = len(texts)
  def __len__(self):
    return self.length
  def __getitem__(self, index):
    return self.input_ids_teacher[index],self.attention_mask_teacher[index],self.input_ids_student[index],self.attention_mask_student[index], self.targets[index]

In [37]:
train_data = MyData(train_data)
valid_data = MyData(valid_data)
test_data = MyData(test_data)

####In Pytorch, dataloaders are iterators that make writing our training loops easier

In [38]:
# now we build our loaders
batch_size = 16
train_loader = DataLoader(train_data, shuffle = True, batch_size = batch_size)
valid_loader = DataLoader(valid_data, shuffle  = True, batch_size = batch_size)
test_loader = DataLoader(test_data, shuffle = True, batch_size = batch_size)

In [39]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


#### we define a function to help us compute accuracy as we train

In [40]:
from tqdm import tqdm
from time import perf_counter

In [44]:

# we define a function to help us compute accuracy as we train, we would also define another function to measure time ellapsed
def accuracy_score(batch, model):
  with torch.no_grad():
    outputs = model(
        batch[0].to(device),
        batch[1].to(device)
    )
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim = 1)
    class_predictions = torch.argmax(probabilities, dim = 1)
    acc = torch.mean((class_predictions == batch[4].to(device)).to(torch.float)).data.item()
    return acc


####Now let us test the accuracyof our already trained teacher model

In [42]:
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/901 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

In [None]:
accuracy = 0.0
time_taken = 0.0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teacher_model)
  end_time = perf_counter()
  accuracy += score
  time_taken += end_time - start_time

print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'number of batch is {len(test_loader)}')
print(f"accuracy is {accuracy / len(test_loader):.2f}")
print(f'time taken per batch is {time_taken / len(test_loader):.6f}')



 15%|█▍        | 70/475 [10:40<56:11,  8.32s/it]

####On a T5 GPU provided by colab we are able to do inference on each batch in .27 seconds. Let's see if we can match perfocnacena dn reduce inference time for the same test becnh


###We download our student model

In [None]:
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_ckpt, num_labels = 4).to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/141M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-6_H-512_A-8 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
epochs = 3
learning_rate = 2e-5
entropy_loss = nn.CrossEntropyLoss()
temperature = 1.0
alpha = 0.1
criterion = nn.KLDivLoss(reduction = 'batchmean')
optimizer = optim.Adam(teacher_model.parameters(), lr = learning_rate)


In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
for epoch in tqdm(range(epochs), total = epochs):
  #start training loader
  student_model.train()
  training_loss = 0.0
  training_kd_loss = 0.0
  training_accuracy = 0.0
  valid_loss = 0.0
  valid_accuracy = 0.0
  for batch in train_loader:
    optimizer.zero_grad()
    input_ids_teach = batch[0].to(device)
    attention_mask_teach = batch[1].to(device)
    input_ids_stud = batch[2].to(device)
    attention_mask_stud = batch[3].to(device)
    target_tensors = batch[4].to(device)
    output_stu = student_model(input_ids = input_ids_stud, attention_mask = attention_mask_stud).logits
    loss_ce = entropy_loss(output_stu, target_tensors).data.item()
    #now we extract logits from teacher too!
    with torch.no_grad():
      outputs_teach = teacher_model(input_ids = input_ids_teach, attention_mask = attention_mask_teach)
      outputs_teach_logits = outputs_teach.logits
    loss_kd = temperature ** 2 * criterion(
        F.log_softmax(output_stu /  temperature, dim = -1),
        F.softmax(outputs_teach_logits / temperature, dim = -1)
        )
    loss = alpha * loss_ce + (1. - alpha) * loss_kd
    loss.backward()
    optimizer.step()
    training_kd_loss += loss_kd.data.item()
    training_loss += loss
    training_accuracy += accuracy_score(batch, student_model)

  student_model.eval()
  for batch in valid_loader:
    input_ids_teach = batch[0].to(device)
    attention_mask_teach = batch[1].to(device)
    input_ids_stud = batch[2].to(device)
    attention_mask_stud = batch[3].to(device)
    target_tensors = batch[4].to(device)
    output = student_model(input_ids = input_ids_stud, attention_mask = attention_mask_stud)
    loss = entropy_loss(output.logits, target_tensors)
    valid_loss += loss.data.item()
    valid_accuracy += accuracy_score(b, student_model)

  training_accuracy /= len(train_loader)
  valid_accuracy /= len(valid_loader)
  training_loss /= len(train_loader)
  training_kd_loss /= len(train_loader)
  valid_loss /= len(valid_loader)
  #we would also test accuracy on validation
  print(f"""
    After epochs {epoch + 1},
    training loss (entropy) was {training_loss},
    Kullback-Leibler (KL) divergence loss was {training_kd_loss}
    validation_loss was {valid_loss}.
    training_accuracy {training_accuracy}
    valid_accuracy {valid_accuracy}
  """)




 33%|███▎      | 1/3 [1:42:03<3:24:07, 6123.65s/it]


    After epochs 1, 
    training loss (entropy) was 1.3893439892839503, 
    Kullback-Leibler (KL) divergence loss was 5.052333517710368
    validation_loss was 1.386915099143982. 
    training_accuracy 0.2638333333333333 
    valid_accuracy 0.99675
  


 33%|███▎      | 1/3 [1:49:38<3:39:17, 6578.52s/it]


KeyboardInterrupt: ignored

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
student_model.push_to_hub('odunola/student_distillation_model')

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/odunola/bert-yelp-review_full-test-set-only/commit/0c636376ca9508e2f235c9c3fbb144dd37a92fa8', commit_message='Upload BertForSequenceClassification', commit_description='', oid='0c636376ca9508e2f235c9c3fbb144dd37a92fa8', pr_url=None, pr_revision=None, pr_num=None)