In [11]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 
                          Trainer, TrainingArguments)
import time
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from torch.nn.utils import prune

In [12]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


### Preporcess Data and Define Metrics

In [13]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [14]:
# prepare the labels dataset for inference
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [16]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

In [17]:
batch_size = 8
metric_name = "f1"

In [18]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Load the Fine-Tuned BERT Model

In [19]:
fine_tuned_path = "/workspaces/LLM-Experimentation-Capstone/10_code/BERT_experiments/bert_base_model/bert-pruned-sem_eval-english"

In [20]:
model_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_path, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

In [21]:
def get_model_size(model):
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    model_size = (param_size + buffer_size) / 1024**2
    return model_size

base_model_size = get_model_size(model)  # Before pruning
print('Base model size (before pruning): {:.3f}MB'.format(base_model_size))

Base model size (before pruning): 417.682MB


### Apply Pruning 

In [22]:
import torch.nn.utils.prune as prune

# magnitude pruning :
parameters_to_prune = (
    (model.bert.embeddings.word_embeddings, 'weight'),
    (model.bert.encoder.layer[0].attention.self.query, 'weight'),
    (model.bert.encoder.layer[0].attention.self.key, 'weight'),
    (model.bert.encoder.layer[0].attention.self.value, 'weight'),
    (model.bert.encoder.layer[0].attention.output.dense, 'weight'),
    (model.bert.encoder.layer[0].intermediate.dense, 'weight'),
    (model.bert.encoder.layer[0].output.dense, 'weight'),
)

In [23]:
def apply_pruning(pruning_weight):
    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=pruning_weight,
    )
    
    # Make pruning permanent
    for module, name in parameters_to_prune:
        prune.remove(module, name)



### Fine-Tune the Pruned Model:

In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./bert-iterative-pruned-sem_eval-english',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [25]:
pruning_steps = 3  # Number of times to prune and retrain
target_pruning_weight = 0.4  # Target pruning to achieve by the end
pruning_weights = np.linspace(0, target_pruning_weight, pruning_steps + 1)[1:]  # Excludes 0 and includes the target weight
results = []

In [26]:
for step, pruning_weight in enumerate(pruning_weights):
    # Apply pruning
    apply_pruning(pruning_weight)
    
    # Fine-tune pruned model
    pruned_model = model
    trainer = Trainer(
        model=pruned_model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=model_tokenizer,
        compute_metrics=compute_metrics
    )
    
    train_start = time.time()
    trainer.train()
    train_end = time.time()
    training_time = train_end - train_start
    
    # Evaluate pruned model
    eval_start = time.time()
    evaluation_metrics = trainer.evaluate()
    eval_end = time.time()
    evaluation_time = eval_end - eval_start
    
    # Record inference time
    input = [x['Tweet'] for x in dataset['validation']]
    pruned_inference_times = []
    for input_ in input:
        inputs = tokenizer(input_, return_tensors="pt", padding=True, truncation=True).to(device)
        start_time = time.time()
        pruned_model(**inputs).logits
        pruned_inference_times.append(time.time() - start_time)
    
    pruned_inference_times_series = pd.Series(pruned_inference_times)
    inference_time_stats = pruned_inference_times_series.describe()
    
    # Calculate model size after pruning
    pruned_model_size = get_model_size(pruned_model)
    
    # Save results
    results.append({
        'step': step,
        'pruning_weight': pruning_weight,
        'training_time': training_time,
        'evaluation_time': evaluation_time,
        'inference_time_stats': inference_time_stats,
        'model_size_mb': pruned_model_size,
        'evaluation_metrics': evaluation_metrics
    })
    
    # Save pruned model
    pruned_model_path = f"./bert-iterative-pruned-step-{step}-sem_eval-english"
    pruned_model.save_pretrained(pruned_model_path)
    model_tokenizer.save_pretrained(pruned_model_path)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1338,0.34849,0.701021,0.796243,0.274266
2,0.1077,0.374184,0.701872,0.802475,0.243792
3,0.1056,0.387396,0.692474,0.796396,0.234763
4,0.0886,0.396101,0.698077,0.799898,0.23702
5,0.0696,0.400446,0.699831,0.800258,0.248307




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0795,0.409612,0.693475,0.796791,0.241535
2,0.0583,0.442998,0.697587,0.803712,0.233634
3,0.0766,0.439126,0.691548,0.795602,0.214447
4,0.0587,0.453379,0.695859,0.800102,0.234763
5,0.042,0.458499,0.699785,0.801983,0.242664




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0298,0.51389,0.691679,0.795867,0.244921
2,0.0252,0.531828,0.694266,0.799144,0.232506
3,0.0259,0.544919,0.697762,0.802916,0.244921
4,0.018,0.563744,0.689574,0.797896,0.222348
5,0.0154,0.565397,0.691905,0.797558,0.226862


In [27]:
for result in results:
    print(f"Iterative pruning step: {result['step']}")
    print(f"Pruning weight: {result['pruning_weight']}")
    print(f"Training time: {result['training_time']}")
    print(f"Evaluation time: {result['evaluation_time']}")
    print("Inference time statistics:")
    print(result['inference_time_stats'])
    print(f"Model size (MB): {result['model_size_mb']}")
    print(f"Evaluation metrics: {result['evaluation_metrics']}")
    print("-----")

Iterative pruning step: 0
Pruning weight: 0.13333333333333333
Training time: 389.447794675827
Evaluation time: 2.47590970993042
Inference time statistics:
count    886.000000
mean       0.014522
std        0.000781
min        0.013614
25%        0.014176
50%        0.014350
75%        0.014581
max        0.022823
dtype: float64
Model size (MB): 417.6816825866699
Evaluation metrics: {'eval_loss': 0.3741838037967682, 'eval_f1': 0.7018722995679308, 'eval_roc_auc': 0.8024749888430884, 'eval_accuracy': 0.24379232505643342, 'eval_runtime': 2.474, 'eval_samples_per_second': 358.12, 'eval_steps_per_second': 44.866, 'epoch': 5.0}
-----
Iterative pruning step: 1
Pruning weight: 0.26666666666666666
Training time: 399.1161663532257
Evaluation time: 2.484055757522583
Inference time statistics:
count    886.000000
mean       0.014680
std        0.000831
min        0.013822
25%        0.014380
50%        0.014502
75%        0.014660
max        0.024030
dtype: float64
Model size (MB): 417.681682586669