In [84]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, DistilBertModel
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import TrainingArguments
import time
import torch.nn as nn
import torch.nn.functional as F
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from torch.nn.utils import prune

In [85]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


### Preporcess Data and Define Metrics

In [86]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [87]:
# prepare the labels dataset for inference
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [88]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [89]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

In [90]:
batch_size = 8
metric_name = "f1"

In [91]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Load the Fine-Tuned BERT Model

In [92]:
fine_tuned_path = "../bert_base_model/bert-finetuned-sem_eval-english"

In [93]:
moodel_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_path, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

In [94]:
def get_model_size(model):
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    model_size = (param_size + buffer_size) / 1024**2
    return model_size

base_model_size = get_model_size(model)  # Before pruning
print('Base model size (before pruning): {:.3f}MB'.format(base_model_size))

Base model size (before pruning): 417.682MB


### Apply Pruning 

In [95]:
import torch.nn.utils.prune as prune

# magnitude pruning :
parameters_to_prune = (
    (model.bert.embeddings.word_embeddings, 'weight'),
    (model.bert.encoder.layer[0].attention.self.query, 'weight'),
    (model.bert.encoder.layer[0].attention.self.key, 'weight'),
    (model.bert.encoder.layer[0].attention.self.value, 'weight'),
    (model.bert.encoder.layer[0].attention.output.dense, 'weight'),
    (model.bert.encoder.layer[0].intermediate.dense, 'weight'),
    (model.bert.encoder.layer[0].output.dense, 'weight'),
)


prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.2,  # Prune 20% of the weights
)

### Fine-Tune the Pruned Model:

In [96]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [97]:
pruned_model = model

In [98]:
# Assuming you've already defined your TrainingArguments, tokenizer, and compute_metrics function:
trainer = Trainer(
    model=pruned_model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

train_start = time.time()
trainer.train()
train_end = time.time()
print("Training time: {}".format(train_end - train_start))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1991,0.335813,0.691836,0.793048,0.261851
2,0.1597,0.350662,0.702406,0.803999,0.260722
3,0.1293,0.360164,0.685799,0.788055,0.243792
4,0.114,0.367959,0.687685,0.789841,0.248307
5,0.0993,0.373168,0.690686,0.792387,0.242664


Training time: 395.40200328826904


### Evaluate the Pruned Model

In [109]:
eval_start = time.time()
print(trainer.evaluate())
eval_end = time.time()
print("Evaluation time: {}".format(eval_end - eval_start))

{'eval_loss': 0.35066232085227966, 'eval_f1': 0.702406480819633, 'eval_roc_auc': 0.8039990092235552, 'eval_accuracy': 0.26072234762979685, 'eval_runtime': 2.6063, 'eval_samples_per_second': 339.941, 'eval_steps_per_second': 42.589, 'epoch': 5.0}
Evaluation time: 2.6098670959472656


### Remove the Pruned Weights

In [100]:
for module, name in parameters_to_prune:
    prune.remove(module, name)

### Evaluation of Pruned Model Size and Inference Time

In [101]:
pruned_model_size = get_model_size(pruned_model)  # After pruning
print('Model size (after pruning): {:.3f}MB'.format(pruned_model_size))

Model size (after pruning): 417.682MB


In [102]:
input = [x['Tweet'] for x in dataset['validation']]

pruned_predictions = []
pruned_times = []

# In the inference loop
for input_ in input:
    # Tokenize the input and move to the appropriate device
    inputs = tokenizer(input_, return_tensors="pt", padding=True, truncation=True).to(device) 
    st = time.time()
    pruned_output = pruned_model(**inputs).logits
    pruned_predictions.append(pruned_output.squeeze().detach().cpu().numpy())
    pruned_times.append(time.time() - st)
    
    
print(pd.Series(pruned_times).describe().T)
ground_truth_labels = np.array([[example[label] for label in labels] for example in dataset['validation']])
print(multi_label_metrics(pruned_predictions, ground_truth_labels))

count    886.000000
mean       0.013667
std        0.001162
min        0.012043
25%        0.012939
50%        0.013609
75%        0.014002
max        0.025277
dtype: float64
{'f1': 0.702406480819633, 'roc_auc': 0.8039990092235552, 'accuracy': 0.26072234762979685}


### Save the Pruned Model

In [103]:
pruned_model_path = "./bert-pruned-sem_eval-english"
pruned_model.save_pretrained(pruned_model_path)