In [21]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 
                          Trainer, TrainingArguments)
import time
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from torch.nn.utils import prune

In [22]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


### Preporcess Data and Define Metrics

In [23]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [24]:
# prepare the labels dataset for inference
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [25]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [26]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

In [27]:
batch_size = 8
metric_name = "f1"

In [28]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Load the Fine-Tuned BERT Model

In [29]:
fine_tuned_path = "/workspaces/LLM-Experimentation-Capstone/bert-student-distilled-model"

In [30]:
model_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_path, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

In [31]:
def get_model_size(model):
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    model_size = (param_size + buffer_size) / 1024**2
    return model_size

base_model_size = get_model_size(model)  # Before pruning
print('Base model size (before pruning): {:.3f}MB'.format(base_model_size))

Base model size (before pruning): 255.443MB


### Apply Pruning 

In [32]:
import torch.nn.utils.prune as prune

# magnitude pruning :
parameters_to_prune = (
    (model.distilbert.embeddings.word_embeddings, 'weight'),
    (model.distilbert.transformer.layer[0].attention.q_lin, 'weight'),
    (model.distilbert.transformer.layer[0].attention.k_lin, 'weight'),
    (model.distilbert.transformer.layer[0].attention.v_lin, 'weight'),
    (model.distilbert.transformer.layer[0].attention.out_lin, 'weight'),
    (model.distilbert.transformer.layer[0].ffn.lin1, 'weight'),
    (model.distilbert.transformer.layer[0].ffn.lin2, 'weight'),
)

prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,  # Prune 20% of the weights
)

### Fine-Tune the Pruned Model:

In [33]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='"bert-distilled-pruned-sem_eval-english"',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [34]:
distilled_pruned_model = model

In [35]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=model_tokenizer,
    compute_metrics=compute_metrics
)

train_start = time.time()
trainer.train()
train_end = time.time()
print("Training time: {}".format(train_end - train_start))

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1587,0.381841,0.684857,0.788422,0.240406
2,0.1211,0.405909,0.68709,0.791401,0.22912
3,0.1225,0.412667,0.67804,0.78552,0.230248
4,0.1126,0.43074,0.6801,0.783856,0.224605
5,0.0929,0.434801,0.682266,0.786571,0.230248


Training time: 214.18993425369263


### Evaluate the Pruned Model

In [41]:
eval_start = time.time()
print(trainer.evaluate())
eval_end = time.time()
print("Evaluation time: {}".format(eval_end - eval_start))

{'eval_loss': 0.4059094786643982, 'eval_f1': 0.6870897155361051, 'eval_roc_auc': 0.7914013233332429, 'eval_accuracy': 0.2291196388261851, 'eval_runtime': 1.5335, 'eval_samples_per_second': 577.777, 'eval_steps_per_second': 72.385, 'epoch': 5.0}
Evaluation time: 1.5361683368682861


### Remove the Pruned Weights

In [37]:
for module, name in parameters_to_prune:
    prune.remove(module, name)

### Evaluation of Pruned Model Size and Inference Time

In [38]:
distilled_pruned_model_size = get_model_size(distilled_pruned_model)  # After pruning
print('Model size (after pruning): {:.3f}MB'.format(distilled_pruned_model_size))

Model size (after pruning): 255.443MB


In [39]:
input = [x['Tweet'] for x in dataset['validation']]

distilled_pruned_predictions = []
distilled_pruned_times = []

# In the inference loop
for input_ in input:
    # Tokenize the input and move to the appropriate device
    inputs = tokenizer(input_, return_tensors="pt", padding=True, truncation=True, return_token_type_ids=False).to(device)
    st = time.time()
    distilled_pruned_output = distilled_pruned_model(**inputs).logits
    distilled_pruned_predictions.append(distilled_pruned_output.squeeze().detach().cpu().numpy())
    distilled_pruned_times.append(time.time() - st)
    
    
print(pd.Series(distilled_pruned_times).describe().T)
ground_truth_labels = np.array([[example[label] for label in labels] for example in dataset['validation']])
print(multi_label_metrics(distilled_pruned_predictions, ground_truth_labels))

count    886.000000
mean       0.007914
std        0.000625
min        0.007486
25%        0.007761
50%        0.007855
75%        0.007948
max        0.018200
dtype: float64
{'f1': 0.6870897155361051, 'roc_auc': 0.7914013233332429, 'accuracy': 0.2291196388261851}


### Save the Pruned Model

In [40]:
distilled_pruned_model_path = "./bert-distilled-pruned-sem_eval-english"
distilled_pruned_model.save_pretrained(distilled_pruned_model_path)
model_tokenizer.save_pretrained(distilled_pruned_model_path)

('./bert-distilled-pruned-sem_eval-english/tokenizer_config.json',
 './bert-distilled-pruned-sem_eval-english/special_tokens_map.json',
 './bert-distilled-pruned-sem_eval-english/vocab.txt',
 './bert-distilled-pruned-sem_eval-english/added_tokens.json',
 './bert-distilled-pruned-sem_eval-english/tokenizer.json')