In [29]:
from datasets import load_dataset
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification
import os

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [30]:
ds = load_dataset("noor-zalouk/wiki-math-articles-multilabel")
print("Dataset loaded")

Dataset loaded


In [31]:
import pandas as pd

df = ds['test'].to_pandas()
all_labels = list(df['category'].explode().unique())

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit([all_labels])

In [33]:
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
config = AutoConfig.from_pretrained(model_ckpt)
config.num_labels = len(all_labels)
config.problem_type = "multi_label_classification"

In [34]:
def prepare(row):
    text = row['title']
    if row['sub_title']:
        text = text + ' ' + row['sub_title']
    else:
        pass

    text = text + ' ' + row['text']

    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    label_ids = mlb.transform([row['category']])[0]

    inputs['label_ids'] = torch.tensor(label_ids, dtype=torch.float)

    return inputs

In [35]:
ds = ds.map(prepare)

Map:   0%|          | 0/56379 [00:00<?, ? examples/s]

Map:   0%|          | 0/18699 [00:00<?, ? examples/s]

Map:   0%|          | 0/18790 [00:00<?, ? examples/s]

In [36]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'category', 'title', 'sub_title', 'input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 56379
    })
    valid: Dataset({
        features: ['text', 'category', 'title', 'sub_title', 'input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 18699
    })
    test: Dataset({
        features: ['text', 'category', 'title', 'sub_title', 'input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 18790
    })
})

In [37]:
ds = ds.remove_columns(['text', 'category', 'title', 'sub_title'])


In [38]:
ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 56379
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 18699
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'label_ids'],
        num_rows: 18790
    })
})

In [39]:
from transformers import Trainer, TrainingArguments

training_args_fine_tune = TrainingArguments(
    output_dir="./BERT_multilabel", num_train_epochs=4, learning_rate=5e-5,
    per_device_train_batch_size=4, per_device_eval_batch_size=32, weight_decay=0.01,
    gradient_accumulation_steps=8,
    eval_strategy="epoch", save_strategy="epoch",logging_steps=100,
    load_best_model_at_end=True, metric_for_best_model='micro f1',
    save_total_limit=1, log_level='error')

In [40]:
from scipy.special import expit as sigmoid
from sklearn.metrics import classification_report


def compute_metrics(pred):
    y_true = pred.label_ids
    y_pred = sigmoid(pred.predictions)
    y_pred = (y_pred>0.5).astype(float)
    clf_dict = classification_report(y_true, y_pred, target_names=all_labels, zero_division=0, output_dict=True)
    return {"micro f1": clf_dict["micro avg"]["f1-score"], "macro f1": clf_dict["macro avg"]["f1-score"]}

In [41]:
trainer = Trainer(model=model, tokenizer=tokenizer,
                      args=training_args_fine_tune,
                      compute_metrics=compute_metrics,
                      train_dataset=ds["train"],
                      eval_dataset=ds["valid"])
trainer.train()

  trainer = Trainer(model=model, tokenizer=tokenizer,


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 11.12 MiB is free. Including non-PyTorch memory, this process has 3.75 GiB memory in use. Of the allocated memory 3.56 GiB is allocated by PyTorch, and 111.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)