In [1]:
from datasets import load_dataset

# Hub identifier of the multi-label Wikipedia math-article dataset.
DATASET_ID = "noor-zalouk/wiki-math-articles-multilabel"

# Load the dataset; the bare trailing expression renders the split summary.
ds = load_dataset(DATASET_ID)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 56379
    })
    valid: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 18699
    })
    test: Dataset({
        features: ['text', 'category', 'title', 'sub_title'],
        num_rows: 18790
    })
})

In [2]:
import pandas as pd

# Test split as a DataFrame (kept under its original name for later use).
df = ds['test'].to_pandas()

# Build the label vocabulary from EVERY split rather than from `test` alone.
# The binarizer fitted on this list is applied to all splits, and
# MultiLabelBinarizer.transform ignores (with a warning) any label it did not
# see at fit time -- so labels occurring only in train/valid would otherwise
# be silently dropped from the targets.
category_per_split = [ds[split].to_pandas()["category"] for split in ds]
all_labels = list(
    pd.concat(category_per_split, ignore_index=True)
    .explode()
    .dropna()  # rows with an empty label list explode to NaN, which is not a label
    .unique()
)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit on a single "sample" containing every known label so the binarizer
# learns the complete class list in one go (fit returns the estimator itself).
mlb = MultiLabelBinarizer().fit([all_labels])

def prepare_labels(batch):
    """Attach a binarized ``category_ids`` entry to a batch.

    Args:
        batch: Mapping with a ``"category"`` entry holding, for each row,
            the collection of labels assigned to that row.

    Returns:
        The same ``batch`` object, with ``"category_ids"`` set to the output
        of the module-level ``mlb.transform`` on the ``"category"`` entry.
    """
    encoded = mlb.transform(batch["category"])
    batch["category_ids"] = encoded
    return batch

# Apply the label encoding batch-wise, adding `category_ids` to the dataset.
ds = ds.map(function=prepare_labels, batched=True)

In [4]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary (capped at 25,000 entries) is learned
# from the training texts only and then reused to encode the test texts.
count_vect = CountVectorizer(max_features=25000)
X_train_counts = count_vect.fit_transform(ds["train"]["text"])
X_test_counts = count_vect.transform(ds["test"]["text"])

# Binarized label matrices taken from the `category_ids` column.
y_train = np.array(ds["train"]["category_ids"])
y_test = np.array(ds["test"]["category_ids"])

# Baseline model: binary relevance over a Multinomial Naive Bayes classifier.
classifier = BinaryRelevance(classifier=MultinomialNB())
classifier.fit(X_train_counts, y_train)

# Predict on the test split and collect per-label and averaged metrics as a dict;
# zero_division=0 makes undefined precision/recall count as 0.
y_pred_test = classifier.predict(X_test_counts)
clf_report = classification_report(
    y_test,
    y_pred_test,
    target_names=mlb.classes_,
    zero_division=0,
    output_dict=True,
)

In [5]:
# (macro-averaged F1, micro-averaged F1) on the test split
tuple(clf_report[avg]["f1-score"] for avg in ("macro avg", "micro avg"))

(0.40407743639350235, 0.43033456042674456)