In [11]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 
                          Trainer, TrainingArguments)
import time
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from torch.nn.utils import prune

In [12]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device:", device)

Using device: cuda


### Preporcess Data and Define Metrics

In [13]:
from datasets import load_dataset

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [14]:
# prepare the labels dataset for inference
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [15]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Tweet"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [16]:
# preprocess data
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")

In [17]:
batch_size = 8
metric_name = "f1"

In [18]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

### Load the Fine-Tuned BERT Model

In [19]:
fine_tuned_path = "/workspaces/LLM-Experimentation-Capstone/10_code/BERT_experiments/bert_base_model/bert-pruned-sem_eval-english"

In [20]:
model_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_path, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id).to(device)

In [21]:
def get_model_size(model):
    param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())
    model_size = (param_size + buffer_size) / 1024**2
    return model_size

base_model_size = get_model_size(model)  # Before pruning
print('Base model size (before pruning): {:.3f}MB'.format(base_model_size))

Base model size (before pruning): 417.682MB


### Apply Pruning 

In [22]:
parameters_to_prune = (
(model.bert.embeddings.word_embeddings, 'weight'),
(model.bert.encoder.layer[0].attention.self.query, 'weight'),
(model.bert.encoder.layer[0].attention.self.key, 'weight'),
(model.bert.encoder.layer[0].attention.self.value, 'weight'),
(model.bert.encoder.layer[0].attention.output.dense, 'weight'),
(model.bert.encoder.layer[0].intermediate.dense, 'weight'),
(model.bert.encoder.layer[0].output.dense, 'weight'),
)

In [23]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='"bert-pruned-sem_eval-english"',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [24]:
pruning_weights = [0.2, 0.4, 0.6]
results = []

for pruning_weight in pruning_weights:
    # Apply pruning
    prune.global_unstructured(
        parameters_to_prune,
        pruning_method=prune.L1Unstructured,
        amount=pruning_weight,
    )
    
    # Fine-tune pruned model
    pruned_model = model
    trainer = Trainer(
        model=pruned_model,
        args=training_args,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["validation"],
        tokenizer=model_tokenizer,
        compute_metrics=compute_metrics
    )
    
    train_start = time.time()
    trainer.train()
    train_end = time.time()
    training_time = train_end - train_start
    
    # Evaluate pruned model
    eval_start = time.time()
    evaluation_metrics = trainer.evaluate()
    eval_end = time.time()
    evaluation_time = eval_end - eval_start
    
    # Record inference time
    input = [x['Tweet'] for x in dataset['validation']]
    pruned_inference_times = []
    for input_ in input:
        inputs = tokenizer(input_, return_tensors="pt", padding=True, truncation=True).to(device)
        start_time = time.time()
        pruned_model(**inputs).logits
        pruned_inference_times.append(time.time() - start_time)
    
    pruned_inference_times_series = pd.Series(pruned_inference_times)
    inference_time_stats = pruned_inference_times_series.describe()
    
    # Calculate model size after pruning
    for module, name in parameters_to_prune:
        prune.remove(module, name)  # Make pruning permanent
    pruned_model_size = get_model_size(pruned_model)
    
    # Save results
    results.append({
        'pruning_weight': pruning_weight,
        'training_time': training_time,
        'evaluation_time': evaluation_time,
        'inference_time_stats': inference_time_stats,
        'model_size_mb': pruned_model_size,
        'evaluation_metrics': evaluation_metrics
    })
    
    # Save pruned model
    pruned_model_path = f"./bert-pruned-{pruning_weight}-sem_eval-english"
    pruned_model.save_pretrained(pruned_model_path)
    model_tokenizer.save_pretrained(pruned_model_path)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.1335,0.34879,0.701719,0.796507,0.273138
2,0.1072,0.374364,0.702041,0.802541,0.24605
3,0.1055,0.388134,0.694524,0.79785,0.23702
4,0.0889,0.396611,0.698535,0.800294,0.233634
5,0.0701,0.400799,0.699952,0.800523,0.247178




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0813,0.409497,0.699833,0.802546,0.256208
2,0.0597,0.443452,0.697936,0.803745,0.244921
3,0.0774,0.441037,0.690786,0.794875,0.225734
4,0.0603,0.451559,0.692783,0.798551,0.23702
5,0.0441,0.457142,0.69723,0.80043,0.241535




Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0776,0.446041,0.693484,0.800439,0.23702
2,0.0605,0.473147,0.690207,0.798259,0.234763
3,0.0763,0.46875,0.683886,0.793267,0.222348
4,0.0576,0.480532,0.692921,0.80064,0.240406
5,0.0421,0.483564,0.691704,0.797691,0.234763


In [25]:
for result in results:
    print(f"Pruning weight: {result['pruning_weight']}")
    print(f"Training time: {result['training_time']}")
    print(f"Evaluation time: {result['evaluation_time']}")
    print("Inference time statistics:")
    print(result['inference_time_stats'])
    print(f"Model size (MB): {result['model_size_mb']}")
    print(f"Evaluation metrics: {result['evaluation_metrics']}")
    print("-----")

Pruning weight: 0.2
Training time: 389.3823115825653
Evaluation time: 2.5104432106018066
Inference time statistics:
count    886.000000
mean       0.014701
std        0.001159
min        0.013908
25%        0.014330
50%        0.014458
75%        0.014677
max        0.029917
dtype: float64
Model size (MB): 417.6816825866699
Evaluation metrics: {'eval_loss': 0.3743639588356018, 'eval_f1': 0.7020408163265306, 'eval_roc_auc': 0.8025409084212032, 'eval_accuracy': 0.24604966139954854, 'eval_runtime': 2.5086, 'eval_samples_per_second': 353.191, 'eval_steps_per_second': 44.249, 'epoch': 5.0}
-----
Pruning weight: 0.4
Training time: 404.18692111968994
Evaluation time: 2.521122455596924
Inference time statistics:
count    886.000000
mean       0.014660
std        0.000796
min        0.013940
25%        0.014364
50%        0.014486
75%        0.014676
max        0.024588
dtype: float64
Model size (MB): 417.6816825866699
Evaluation metrics: {'eval_loss': 0.4094970226287842, 'eval_f1': 0.699833373