##First we install Dependencies

In [1]:
!pip install -q transformers datasets


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
dataset_ckpt = 'ag_news'
teacher_model_ckpt = 'odunola/bert-base-uncased-ag-news-finetuned-2' #our already finetuned teacher model
student_model_ckpt = 'distilbert-base-uncased'

####Importing dependencies

In [3]:
from huggingface_hub import notebook_login
from datasets import load_dataset
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from transformers import AutoModelForSequenceClassification
from torch import nn
from torch import optim
import pandas as pd
from torch.nn import functional as F
from transformers import AutoTokenizer

###Preprocessing Data
We would be using the ag_news dataset comprised of 120k training samples and 7600 test samples. for our validation set we extract 12k samples from the original train set



In [4]:
data = load_dataset(dataset_ckpt)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [5]:
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [5]:
train_test = data['train'].train_test_split(test_size = 0.2)
valid_data = train_test['test']
train_data = train_test['train']
test_data = data['test']

In [6]:
def get_num_rows(dataset):
  return dataset.num_rows

print(f'Train set has {get_num_rows(train_data)} texts')
print(f'Valid set has {get_num_rows(valid_data)} texts')
print(f'Test set has {get_num_rows(test_data)} texts')

Train set has 96000 texts
Valid set has 24000 texts
Test set has 7600 texts


####Now we pull our tokenizer from the huggingface hub. since both are BERT Models we should be able to use the same tokenizzer for both student and teacher

In [7]:
tokenizer = AutoTokenizer.from_pretrained(teacher_model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [8]:
#now we would utilise pytorch's Dataset andDataloader classes to create our dataset

class MyData(Dataset):
  def __init__(self, data):
    targets = data['label']
    texts = data['text']

    tokens = tokenizer(texts, return_tensors = 'pt', truncation = True, padding = 'max_length', max_length = 128)
    self.input_ids = tokens['input_ids']
    self.attention_mask = tokens['attention_mask']
    self.targets = torch.tensor(targets)
    self.length = len(texts)
  def __len__(self):
    return self.length
  def __getitem__(self, index):
    return self.input_ids[index], self.attention_mask[index], self.targets[index]

In [9]:
train_data = MyData(train_data)
valid_data = MyData(valid_data)
test_data = MyData(test_data)

####In Pytorch, dataloaders are iterators that make writing our training loops easier

In [10]:
# now we build our loaders
batch_size = 64
train_loader = DataLoader(train_data,batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_data, batch_size = batch_size)
test_loader = DataLoader(test_data, batch_size = batch_size)

In [11]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


#### we define a function to help us compute accuracy as we train

In [12]:
from tqdm import tqdm
from time import perf_counter

In [13]:

# we define a function to help us compute accuracy as we train, we would also define another function to measure time ellapsed
def accuracy_score(batch, model):
  with torch.no_grad():
    outputs = model(
        batch[0].to(device),
        batch[1].to(device)
    )
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim = 1)
    class_predictions = torch.argmax(probabilities, dim = 1)
    acc = torch.mean((class_predictions == batch[2].to(device)).to(torch.float)).data.item()
    return acc


####Now let us test the accuracyof our already trained teacher model

In [14]:
teacher_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt).to(device)
student_model = AutoModelForSequenceClassification.from_pretrained(student_model_ckpt, num_labels=4).to(device)
student_model.dropout = nn.Dropout(0.5)

Downloading (…)lve/main/config.json:   0%|          | 0.00/944 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
accuracy = 0.0
time_taken = 0.0
count = 0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teacher_model)
  end_time = perf_counter()
  accuracy += score
  time_taken += end_time - start_time

print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'number of batch is {len(test_loader)}')
print(f"accuracy is {accuracy / len(test_loader):.2f}")
print(f'time taken per batch is {time_taken / len(test_loader):.6f}')



100%|██████████| 119/119 [00:55<00:00,  2.14it/s]




number of samples in each batch is 48
number of batch is 119
accuracy is 0.94
time taken per batch is 0.463092





####On a T5 GPU provided by colab we are able to do inference on each batch in .27 seconds. Let's see if we can match perfocnacena dn reduce inference time for the same test becnh


###We download our student model

In [21]:
for batch in test_loader:
  output = student_model(batch[0].to(device), batch[1].to(device))

In [16]:
def get_parameter_count(model):
  num_params = sum(p.numel() for p in model.parameters())
  return num_params

print(f'teacher model has {(get_parameter_count(teacher_model)/1000000):.2f} parameters')
print(f'student model has {(get_parameter_count(student_model)/1000000):.2f} parameters')

teacher model has 109.49 parameters
student model has 66.96 parameters


In [17]:
from torch.optim.lr_scheduler import LambdaLR





In [17]:
epochs = 5
learning_rate = 1e-3
lr_lambda  = lambda epoch: 1- (epoch / epochs)
entropy_loss = nn.CrossEntropyLoss()
temperature = 1.0
alpha = 0.5 #test this
criterion = nn.KLDivLoss(reduction = 'batchmean')
optimizer = optim.AdamW(student_model.parameters(), lr = learning_rate)
#scheduler = LambdaLR(optimizer, lr_lambda = lr_lambda)



In [None]:


# Lists to store training and validation metrics
training_loss_list = []
training_kd_loss_list = []
ce_loss_list = []
training_accuracy_list = []
valid_loss_list = []
valid_accuracy_list = []

#starting loop
for epoch in tqdm(range(epochs), total=epochs):
    student_model.train()

    #scheduler.step()
    train_loss = 0.0
    ce_losses = 0.0
    train_kd_loss = 0.0
    train_accuracy = 0.0
    valid_loss = 0.0
    valid_accuracy = 0.0

    for batch in tqdm(train_loader, total = len(train_loader)):
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        target_tensors = batch[2].to(device)

        # Student model predictions
        student_logits = student_model(input_ids=input_ids, attention_mask=attention_mask).logits
        ce_loss = entropy_loss(student_logits, target_tensors).data.item()
        ce_losses += ce_loss

        # We extract teacher model logits
        with torch.no_grad():
            teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
            teacher_logits = teacher_outputs.logits

        # Knowledge distillation loss (KD divergence)
        kd_loss = temperature ** 2 * criterion(
            F.log_softmax(student_logits / temperature, dim=-1),
            F.softmax(teacher_logits / temperature, dim=-1)
        )

        # Combined loss
        loss = alpha * ce_loss + (1. - alpha) * kd_loss
        loss.backward()
        optimizer.step()
        #scheduler.step()

        # Update training metrics
        train_kd_loss += kd_loss.data.item()
        train_loss += loss
        accuracy = accuracy_score(batch, student_model)
        train_accuracy += accuracy

    student_model.eval()
    for batch in tqdm(valid_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        target_tensors = batch[2].to(device)

        # Validation loss
        output = student_model(input_ids=input_ids, attention_mask=attention_mask)
        val_loss = entropy_loss(output.logits, target_tensors)
        valid_loss += val_loss.data.item()

        # Update validation accuracy
        accuracy = accuracy_score(batch, student_model)
        valid_accuracy += accuracy

    # Calculate average metrics
    train_accuracy /= len(train_loader)
    valid_accuracy /= len(valid_loader)
    train_loss /= len(train_loader)
    train_kd_loss /= len(train_loader)
    valid_loss /= len(valid_loader)
    ce_losses /= len(train_loader)

    # Append metrics to lists
    training_kd_loss_list.append(train_kd_loss)
    training_loss_list.append(train_loss.cpu().detach().numpy())
    training_accuracy_list.append(train_accuracy)
    valid_loss_list.append(valid_loss)
    valid_accuracy_list.append(valid_accuracy)
    ce_loss_list.append(ce_losses)

    # Print and store metrics
    print(f"""
    After epoch {epoch + 1}:
    Training loss (entropy): {ce_losses}
    Training loss (weighted): {train_loss}
    Kullback-Leibler (KL) divergence loss: {train_kd_loss}
    Validation loss (entropy): {valid_loss}
    Training accuracy: {train_accuracy}
    Validation accuracy: {valid_accuracy}
    """)

# Create a DataFrame to store the metrics
metrics = pd.DataFrame({
    'training_loss': training_loss_list,
    'training_ce_loss': ce_loss_list,
    'training_kd_loss': training_kd_loss_list,
    'training_accuracy': training_accuracy_list,
    'valid_loss': valid_loss_list,
    'valid_accuracy': valid_accuracy_list
})


In [None]:
metrics = pd.DataFrame({
    'training_loss': training_loss_list,
    'training_kd_loss': training_kd_loss_list,
    'training_accuracy': training_accuracy_list,
    'valid_loss': valid_loss_list,
    'valid_accuracy': valid_accuracy_list
})

metrics.head(10)

In [None]:
ce_loss

In [20]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
student_model.push_to_hub('odunola/distillbert-distilled-ag-news')
tokenizer.push_to_hub('odunola/distillbert-distilled-ag-news')

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/odunola/distillbert-distilled-ag-news/commit/2cd8c5f3b42e5d11101cb4c8b9fd31d914859874', commit_message='Upload tokenizer', commit_description='', oid='2cd8c5f3b42e5d11101cb4c8b9fd31d914859874', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
accuracy_teacher = 0.0
time_taken_teacher = 0.0

accuracy_student = 0.0
time_taken_student = 0.0
count = 0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teacher_model)
  end_time = perf_counter()
  accuracy_teacher += score
  time_taken_teacher += end_time - start_time

  start_time = perf_counter()
  score = accuracy_score(batch, student_model)
  end_time = perf_counter()
  accuracy_student += score
  time_taken_student += end_time - start_time


print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'total number of batches is {len(test_loader)}')
print(f"teacher accuracy is {accuracy_teacher / len(test_loader):.2f}")
print(f'time taken per batch for teacher is {time_taken_teacher / len(test_loader):.6f}')
print('\n\n\n')
print(f"student accuracy is {accuracy_student / len(test_loader):.2f}")
print(f'time taken per batch for student is {time_taken_student / len(test_loader):.6f}')


100%|██████████| 119/119 [01:26<00:00,  1.38it/s]




number of samples in each batch is 48
total number of batches is 119
teacher accuracy is 0.94
time taken per batch for teacher is 0.478321




student accuracy is 0.94
time taken per batch for student is 0.240917





On CPU

In [None]:
teach_model = AutoModelForSequenceClassification.from_pretrained(teacher_model_ckpt)
stud_model = AutoModelForSequenceClassification.from_pretrained('odunola/student_distilled-financial-news')

In [None]:
from tqdm import tqdm
from time import perf_counter
device = 'cpu'

In [None]:
accuracy_teacher = 0.0
time_taken_teacher = 0.0

accuracy_student = 0.0
time_taken_student = 0.0
count = 0
for batch in tqdm(test_loader):
  start_time = perf_counter()
  score = accuracy_score(batch, teach_model)
  end_time = perf_counter()
  accuracy_teacher += score
  time_taken_teacher += end_time - start_time

  start_time = perf_counter()
  score = accuracy_score(batch, stud_model)
  end_time = perf_counter()
  accuracy_student += score
  time_taken_student += end_time - start_time


print('\n\n')
print(f"number of samples in each batch is {len(batch[0])}")
print(f'total number of batches is {len(test_loader)}')
print(f"teacher accuracy is {accuracy_teacher / len(test_loader):.2f}")
print(f'time taken per batch for teacher is {time_taken_teacher / len(test_loader):.6f}')
print('\n\n\n')
print(f"student accuracy is {accuracy_student / len(test_loader):.2f}")
print(f'time taken per batch for student is {time_taken_student / len(test_loader):.6f}')
