In [1]:
from datasets import load_dataset

ds = load_dataset("noor-zalouk/wiki-math-articles-multilabel")
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 56379
    })
    valid: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 18699
    })
    test: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 18790
    })
})

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

df = ds['test'].to_pandas()
all_labels = list(df['category'].explode().unique())
mlb = MultiLabelBinarizer()
mlb.fit([all_labels])

In [3]:
import torch

def prepare(row):
    text = row['title']
    if row['sub_title']:
        text = text + ' ' + row['sub_title']
    else:
        pass

    text = text + ' ' + row['text']

    label_ids = mlb.transform([row['category']])[0]

    return {'input': text, 'label_ids': torch.tensor(label_ids, dtype=torch.float)}

ds = ds.map(prepare)
ds = ds.remove_columns(['text', 'category', 'title', 'sub_title'])

Map:   0%|          | 0/18790 [00:00<?, ? examples/s]

In [4]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ckpt = "./BERT_multilabel/checkpoint-10572"

config = AutoConfig.from_pretrained(model_ckpt)
config.id2label = {i: label for i, label in enumerate(all_labels)}
config.label2id = {label: i for i, label in enumerate(all_labels)}

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)
#pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=-1)

In [5]:
from pathlib import Path
from time import perf_counter
import numpy as np
from sklearn.metrics import classification_report


class PerformanceBenchmark:
    def __init__(self, pipeline, dataset, optim_type="BERT baseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type

    def compute_metrics(self):
        y_pred, y_true = [], []

        for example in self.dataset:
            true = example["label_ids"]
            pred = self.pipeline(example["input"], truncation=True, padding=True, max_length=512)
            scores = []
            
            for i in pred[0]:
                if i["score"] > 0.5:
                    scores.append(1)
                else:
                    scores.append(0) 

            y_pred.append(scores)
            y_true.append(true)
        
        clf_report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0, output_dict=True)
        print(f"micro f1: {clf_report["micro avg"]["f1-score"]:.2f}, macro f1: {clf_report["macro avg"]["f1-score"]:.2f}")
        return {"micro f1": clf_report["micro avg"]["f1-score"], "macro f1": clf_report["macro avg"]["f1-score"]}
    
    def compute_size(self):
        state_dict = self.pipeline.model.state_dict()
        tmp_path = Path("model.pt")
        torch.save(state_dict, tmp_path)
        # Calculate size in megabytes
        size_mb = Path(tmp_path).stat().st_size / (1024 * 1024)
        # Delete temporary file
        tmp_path.unlink()
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}
    
    def time_pipeline(self, query="This is the given quartic equation::x^4 - ax^3 + bx^2 - cx + d = 0"):
        latencies = []
        # Warmup
        for _ in range(10):
            _ = self.pipeline(query, truncation=True, padding=True, max_length=512)
        # Timed run
        for _ in range(100):
            start_time = perf_counter()
            _ = self.pipeline(query)
            latency = perf_counter() - start_time
            latencies.append(latency)
        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_metrics())
        return metrics

In [68]:
pb = PerformanceBenchmark(pipe, ds["test"])
perf_metrics = pb.run_benchmark()

Model size (MB) - 417.79
Average latency (ms) - 38.23 +/- 5.89
micro f1: 0.84, macro f1: 0.80


In [6]:
from transformers import TrainingArguments

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature

In [7]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
    
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs_stu = model(**inputs)
        # Extract cross-entropy loss and logits from student
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits
        # Extract logits from teacher
        with torch.no_grad():
            outputs_tea = self.teacher_model(**inputs)
            logits_tea = outputs_tea.logits
        # Soften probabilities and compute distillation loss
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        loss_kd = self.args.temperature ** 2 * loss_fct(F.log_softmax(logits_stu / self.args.temperature, dim=-1),
                                                        F.softmax(logits_tea / self.args.temperature, dim=-1))
        # Return weighted student loss
        loss = (self.args.alpha * loss_ce) + ((1. - self.args.alpha) * loss_kd)
        return (loss, outputs_stu) if return_outputs else loss

In [8]:
student_ckpt = "distilbert-base-uncased"
student_config = AutoConfig.from_pretrained(student_ckpt, num_labels=config.num_labels,
                                            id2label=config.id2label, label2id=config.label2id)
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)

def student_init():
    return (AutoModelForSequenceClassification.from_pretrained(
            student_ckpt, config=student_config).to(device))

In [9]:
def tokenize_text(batch):
    return student_tokenizer(batch["input"], truncation=True)

ds_enc = ds.map(tokenize_text, batched=True, remove_columns=["input"])

Map:   0%|          | 0/18790 [00:00<?, ? examples/s]

In [10]:
from scipy.special import expit as sigmoid

def compute_metrics(pred):
    y_true = pred.label_ids
    y_pred = sigmoid(pred.predictions)
    y_pred = (y_pred>0.5).astype(float)
    clf_report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0, output_dict=True)
    return {"micro f1": clf_report["micro avg"]["f1-score"], "macro f1": clf_report["macro avg"]["f1-score"]}

In [14]:
student_training_args = DistillationTrainingArguments(
    output_dir="./DistilBERT_multilabel", eval_strategy = "epoch",
    num_train_epochs=6, learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    alpha=0.5, temperature=2, weight_decay=0.01)

In [15]:
distilbert_trainer = DistillationTrainer(
                     model_init=student_init,
                     teacher_model=model,
                     args=student_training_args,
                     train_dataset=ds_enc['train'],
                     eval_dataset=ds_enc['valid'],
                     compute_metrics=compute_metrics,
                     processing_class=student_tokenizer)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
distilbert_trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer

y_train = np.array(ds["train"]["category_ids"])
y_test = np.array(ds["test"]["category_ids"])
# Use a simple count vectorizer to encode our texts as token counts
count_vect = CountVectorizer(max_features=25000)
X_train_counts = count_vect.fit_transform(ds["train"]["text"])
X_test_counts = count_vect.transform(ds["test"]["text"])
# Create and train our model!
classifier = BinaryRelevance(classifier=MultinomialNB())
classifier.fit(X_train_counts, y_train)
# Generate predictions and evaluate
y_pred_test = classifier.predict(X_test_counts)
clf_report = classification_report(y_test, y_pred_test, target_names=mlb.classes_, zero_division=0, output_dict=True)

In [5]:
clf_report["macro avg"]["f1-score"], clf_report["micro avg"]["f1-score"]

(0.40407743639350235, 0.43033456042674456)