In [None]:
from datasets import load_dataset

# Load the multi-label dataset from the Hugging Face Hub.
# The displayed DatasetDict below shows train/valid/test splits, each with an
# 'input' column and a 'labels' column.
ds = load_dataset("noor-zalouk/wiki-math-articles-multilabel")
ds

DatasetDict({
    train: Dataset({
        features: ['input', 'labels'],
        num_rows: 60043
    })
    valid: Dataset({
        features: ['input', 'labels'],
        num_rows: 19902
    })
    test: Dataset({
        features: ['input', 'labels'],
        num_rows: 19953
    })
})

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Build the label vocabulary and fit a MultiLabelBinarizer on it.
# NOTE(review): the vocabulary is taken from the *test* split only; a label that
# appears only in train/valid would be unknown to `mlb` — confirm that every
# split shares the same label set.
df = ds['test'].to_pandas()
all_labels = list(
    df['labels']
    .explode()   # one row per (example, label) pair
    .unique()    # distinct label values
)
# Fit on a single "sample" that contains every label so all classes are registered.
mlb = MultiLabelBinarizer().fit([all_labels])
mlb

0,1,2
,classes,
,sparse_output,False


In [None]:
import torch

# Turn each example's label list into a multi-hot float vector
def prepare(row):
    """Encode one example's labels as a multi-hot vector.

    Args:
        row: a single dataset example; only ``row['labels']`` is read.

    Returns:
        A dict with one key, ``'label_ids'``, holding a float tensor produced by
        the module-level ``mlb`` binarizer.
    """
    # `transform` expects a batch, so wrap the single label list and unwrap the result.
    encoded_batch = mlb.transform([row['labels']])
    multi_hot = encoded_batch[0]
    return {'label_ids': torch.tensor(multi_hot, dtype=torch.float)}

# Add the 'label_ids' column to every split, then drop the raw 'labels' column.
ds = ds.map(prepare).remove_columns(['labels'])

Map:   0%|          | 0/60043 [00:00<?, ? examples/s]

Map:   0%|          | 0/19902 [00:00<?, ? examples/s]

Map:   0%|          | 0/19953 [00:00<?, ? examples/s]

In [None]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer

# Baseline: Binary Relevance (one Multinomial Naive Bayes model per label)
# trained on plain token counts.

# Encode the texts as token counts; the vocabulary is learned from the training
# texts and then reused unchanged for the test texts.
count_vect = CountVectorizer(max_features=25000)
X_train_counts = count_vect.fit_transform(ds["train"]["input"])
X_test_counts = count_vect.transform(ds["test"]["input"])

# Multi-hot label matrices for the train and test splits
y_train = np.array(ds["train"]["label_ids"])
y_test = np.array(ds["test"]["label_ids"])

# Fit the baseline on the training split
classifier = BinaryRelevance(classifier=MultinomialNB())
classifier.fit(X_train_counts, y_train)

# Predict on the test split and collect per-label and averaged metrics as a dict
y_pred_test = classifier.predict(X_test_counts)
clf_report = classification_report(
    y_test,
    y_pred_test,
    target_names=mlb.classes_,
    zero_division=0,
    output_dict=True,
)

In [6]:
# (macro F1, micro F1) of the Naive Bayes baseline
tuple(clf_report[average]["f1-score"] for average in ("macro avg", "micro avg"))

(0.42876269365396574, 0.45392398671612516)

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Load the fine-tuned DistilBERT model for multi-label classification
# NOTE(review): this is a local, relative checkpoint path — the cell only works
# when the notebook runs from a directory containing ./DistilBERT_multilabel.
model_ckpt = "./DistilBERT_multilabel/checkpoint-8451"

# Config, tokenizer and weights are all read from the same checkpoint directory.
config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
# return_all_scores=True: the benchmark code below iterates over pred[0] and reads
# each entry's "score", so it relies on getting a score for every label.
# NOTE(review): presumably the scores come back in label-index order matching
# mlb.classes_ — confirm before switching to another output option.
# device=-1 runs the pipeline on CPU (see "Device set to use cpu" in the output).
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=-1)

Device set to use cpu


In [None]:
import tempfile
from pathlib import Path
from time import perf_counter

# Benchmarking class to evaluate model performance
class PerformanceBenchmark:
    """Benchmark a text-classification pipeline on size, latency and F1.

    The class relies on names defined in other notebook cells:
    ``classification_report`` (scikit-learn), ``mlb`` (the fitted label
    binarizer, used for label names), ``np`` and ``torch``.

    Args:
        pipeline: callable invoked as ``pipeline(text, **tokenizer_kwargs)``.
            Its result is indexed as ``result[0]`` and iterated; each item must
            expose a ``"score"`` entry. ``pipeline.model.state_dict()`` is used
            by ``compute_size``.
        dataset: iterable of examples providing ``"input"`` (text) and
            ``"label_ids"`` (multi-hot ground truth).
        optim_type: key under which ``run_benchmark`` stores the results.
        threshold: a label is predicted when its score is strictly greater
            than this value.
    """

    # Tokenizer arguments shared by every pipeline call, so that the accuracy
    # run, the warmup and the timed run all use identical preprocessing.
    _TOKENIZER_KWARGS = {"truncation": True, "padding": True, "max_length": 512}

    def __init__(self, pipeline, dataset, optim_type="BERT baseline", threshold=0.5):
        self.pipeline = pipeline
        self.dataset = dataset
        self.optim_type = optim_type
        self.threshold = threshold

    def compute_metrics(self):
        """Run the pipeline over the dataset and report micro/macro F1.

        Returns:
            dict with keys ``"micro f1"`` and ``"macro f1"``.
        """
        y_pred, y_true = [], []

        for example in self.dataset:
            pred = self.pipeline(example["input"], **self._TOKENIZER_KWARGS)
            # pred[0] holds one entry per label; binarize each score.
            y_pred.append([1 if label["score"] > self.threshold else 0 for label in pred[0]])
            y_true.append(example["label_ids"])

        # Compute classification report
        clf_report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0, output_dict=True)
        micro_f1 = clf_report["micro avg"]["f1-score"]
        macro_f1 = clf_report["macro avg"]["f1-score"]
        # Values are bound to locals first: nesting the same quote type inside an
        # f-string is a SyntaxError on Python versions before 3.12.
        print(f"micro f1: {micro_f1:.2f}, macro f1: {macro_f1:.2f}")
        return {"micro f1": micro_f1, "macro f1": macro_f1}

    def compute_size(self):
        """Measure the on-disk size of the model's serialized state dict.

        Returns:
            dict with key ``"size_mb"`` (size in mebibytes).
        """
        state_dict = self.pipeline.model.state_dict()
        # Serialize into a private temporary directory: this never touches a
        # "model.pt" that may already exist in the working directory, and the
        # file is removed even if saving or stat-ing raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / "model.pt"
            torch.save(state_dict, tmp_path)
            # Calculate size in megabytes
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def time_pipeline(self, query="This is the given quartic equation::x^4 - ax^3 + bx^2 - cx + d = 0",
                      n_warmup=10, n_runs=100):
        """Measure the latency of single-query pipeline calls.

        Args:
            query: text sent to the pipeline on every call.
            n_warmup: number of untimed calls made before measuring.
            n_runs: number of timed calls.

        Returns:
            dict with ``"time_avg_ms"`` and ``"time_std_ms"`` (milliseconds).
        """
        # Warmup
        for _ in range(n_warmup):
            _ = self.pipeline(query, **self._TOKENIZER_KWARGS)
        # Timed run — same tokenizer arguments as the warmup and the accuracy
        # run, so the measured latency reflects the evaluated configuration.
        latencies = []
        for _ in range(n_runs):
            start_time = perf_counter()
            _ = self.pipeline(query, **self._TOKENIZER_KWARGS)
            latencies.append(perf_counter() - start_time)
        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Run the size, latency and accuracy measurements in that order.

        Returns:
            ``{optim_type: {...}}`` where the inner dict merges the results of
            ``compute_size``, ``time_pipeline`` and ``compute_metrics``.
        """
        results = self.compute_size()
        results.update(self.time_pipeline())
        results.update(self.compute_metrics())
        return {self.optim_type: results}

In [None]:
# Run the performance benchmark on the test split.
# No optim_type is passed, so the results are stored under the default key
# "BERT baseline" in the returned dict.
pb = PerformanceBenchmark(pipe, ds["test"])
perf_metrics = pb.run_benchmark()

Model size (MB) - 255.52
Average latency (ms) - 14.31 +/- 0.51
micro f1: 0.86, macro f1: 0.84
