In [None]:
# default_exp examples.multilabel_classification


In [None]:
# all_slow



In [None]:
#hide
# Reload edited modules automatically before each cell executes, and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Multi-label classification

> This is an example of how to use blurr for multilabel classification tasks using the mid, high, and low-level Blurr APIs.

In [None]:
# export
import os

import datasets
from transformers import *
from fastai.text.all import *
from fastai.callback.hook import _print_shapes

from blurr.utils import *
from blurr.data.core import *
from blurr.modeling.core import *

# Only surface error-level log messages (presumably the `logging` utility exported by transformers — TODO confirm).
logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from fastcore.test import *
from nbverbose.showdoc import show_doc

# Turn off tokenizer-level parallelism via its environment flag before any tokenizer is created below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Report the library versions this notebook is running with.
print("Here's what we're running with ...\n")
print_versions("torch fastai transformers")


In [None]:
# cuda
# hide
# Prefer GPU #1 (the originally configured device), but fall back to GPU #0 on
# single-GPU machines and to the CPU when CUDA is unavailable, instead of crashing.
if torch.cuda.is_available():
    gpu_idx = 1 if torch.cuda.device_count() > 1 else 0
    torch.cuda.set_device(gpu_idx)
    print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")
else:
    print("CUDA is not available; running on CPU")


Let's start by building our `DataBlock`

In [None]:
# Load the civil_comments dataset; the result is indexed by split name ("train", "validation") in the next cell.
raw_datasets = datasets.load_dataset("civil_comments")


In [None]:
# --- Option 1: Experimental subset (using 10k training examples) ---
raw_train_df = raw_datasets["train"].select(range(10000)).to_pandas()
raw_valid_df = raw_datasets["validation"].select(range(2000)).to_pandas()

# --- Option 2: Full dataset (using the predefined training and validation sets) ---
# raw_train_df = pd.DataFrame(raw_datasets['train'], columns=list(raw_datasets['train'].features.keys()))
# raw_valid_df = pd.DataFrame(raw_datasets['validation'], columns=list(raw_datasets['validation'].features.keys()))

# Flag which split each row belongs to so a column-based splitter can separate them later.
raw_train_df["is_valid"] = False
raw_valid_df["is_valid"] = True

# Both frames carry their own 0-based index; `ignore_index=True` gives the combined
# frame a fresh unique index so label-based lookups are never ambiguous.
toxic_df = pd.concat([raw_train_df, raw_valid_df], ignore_index=True)
print(len(toxic_df))


In [None]:
# Preview the first rows of the combined train + validation frame.
toxic_df.head()


In [None]:
# Label columns to predict; this list (and its order) is reused below as the target vocab and to size the model head.
lbl_cols = ["identity_attack", "insult", "obscene", "toxicity", "severe_toxicity", "sexual_explicit", "threat"]
lbl_cols


In [None]:
# Round every label column to 0 decimal places, then let pandas pick the
# best-fitting dtype for each column of the frame.
rounding_spec = dict.fromkeys(lbl_cols, 0)
toxic_df = toxic_df.round(rounding_spec).convert_dtypes()

toxic_df.head()


## Mid-level API

For our huggingface model, let's use the distilled version of RoBERTa. This should allow us to train the model on bigger mini-batches without much performance loss.  Even on my 1080Ti, I should be able to train all the parameters (which isn't possible with the `roberta-base` model).

In [None]:
# Task-specific model class and the pretrained checkpoint to start from.
model_cls = AutoModelForSequenceClassification
pretrained_model_name = "distilroberta-base"

# Size the classification head: one output per label column.
config = AutoConfig.from_pretrained(pretrained_model_name)
config.num_labels = len(lbl_cols)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(
    pretrained_model_name,
    model_cls=model_cls,
    config=config,
)
# Mark the task as multi-label on the model's config.
hf_model.config.problem_type = "multi_label_classification"

# Show the architecture name, then the concrete class of each remaining object.
print(hf_arch)
for hf_obj in (hf_config, hf_tokenizer, hf_model):
    print(type(hf_obj))


Note how we have to configure the `num_labels` to the number of labels we are predicting. Given that our labels are already encoded, we use a `MultiCategoryBlock` with `encoded=True` and `vocab` equal to the columns with our 1's and 0's.

In [None]:
# Inputs: text handled by the Hugging Face objects built above.
text_block = TextBlock(hf_arch, hf_config, hf_tokenizer, hf_model)
# Targets: the already-encoded label columns, with the column names as the vocab.
label_block = MultiCategoryBlock(encoded=True, vocab=lbl_cols)
blocks = (text_block, label_block)

dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader("text"),
    get_y=ColReader(lbl_cols),
    splitter=ColSplitter(),
)



In [None]:
# Build the DataLoaders from the DataFrame: batches of 4 for training and 8 for validation.
dls = dblock.dataloaders(toxic_df, bs=4, val_bs=8)


In [None]:
# Grab a single batch and inspect it: number of elements, `input_ids` shape, and target shape.
b = dls.one_batch()
len(b), b[0]["input_ids"].shape, b[1].shape

With our DataLoaders built, we can now build our `Learner` and train.  We'll use mixed precision so we can train with bigger batches.

In [None]:
# Wrap the Hugging Face model; the wrapper is what gets handed to the `Learner`.
model = BaseModelWrapper(hf_model)

learn = Learner(
    dls,
    model,
    opt_func=partial(Adam),
    loss_func=PreCalculatedBCELoss(), # BCEWithLogitsLossFlat(),
    metrics=[partial(accuracy_multi, thresh=0.2)],  # multi-label accuracy at a 0.2 threshold
    cbs=[BaseModelCallback],
    splitter=blurr_splitter,  # defines the parameter groups used by freeze/unfreeze
).to_fp16()  # mixed precision, so bigger batches fit in memory

# Keep the loss function's threshold in line with the metric's threshold.
learn.loss_func.thresh = 0.2
# Start with the earlier parameter groups frozen; they are unfrozen after the first fit.
learn.freeze()


In [None]:
# learn.summary()


In [None]:
# Sanity check: push the batch inputs through the wrapped model directly and inspect the logits shape.
preds = model(b[0])
preds.logits.shape, preds


In [None]:
# Run the learning-rate finder to help choose the `lr_max` used in the fits below.
learn.lr_find()


In [None]:
# One epoch of one-cycle training, peaking at a learning rate of 1e-2 (the model is still frozen here).
learn.fit_one_cycle(1, lr_max=1e-2)


In [None]:
# Make every parameter group trainable again before fine-tuning.
learn.unfreeze()


In [None]:
# One more one-cycle epoch; the slice spreads learning rates from 1e-8 to 1e-4 across the parameter groups.
learn.fit_one_cycle(1, lr_max=slice(1e-8, 1e-4))


In [None]:
# Display results for up to 2 examples.
learn.show_results(learner=learn, max_n=2)


In [None]:
# Lower the loss function's threshold before predicting.
# NOTE(review): 0.02 differs from the 0.2 used while training — confirm this is intended and not a typo.
learn.loss_func.thresh = 0.02


In [None]:
# Run a single raw-text prediction on a hand-written comment.
comment = """
Those damned affluent white people should only eat their own food, like cod cakes and boiled potatoes. 
No enchiladas for them!
"""
learn.blurr_predict(comment)


In [None]:
# Collect predictions and targets (no losses) and check that their shapes line up.
preds, targs = learn.get_preds(with_loss=False)
preds.shape, targs.shape


## High-level API

With the high-level API, we can create our DataBlock, DataLoaders, and Blearner in one line of code

In [None]:
# hide
# Drop the mid-level learner so its memory can be reclaimed before the next section.
try:
    del learn
except NameError:
    # `learn` was never created (e.g. this section is run on its own); nothing to delete.
    pass

# Release cached GPU memory when a GPU is in use; skipped entirely on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()



In [None]:
# Build the learner straight from the DataFrame: text column, list of label columns, and a batch size of 4.
learn = BlearnerForSequenceClassification.from_dataframe(
    toxic_df, pretrained_model_name, text_attr="text", label_attr=lbl_cols, dl_kwargs={"bs": 4}
)

# NOTE(review): threshold 0.02 here vs. the 0.2 used when the mid-level learner was created — confirm intended.
learn.loss_func.thresh = 0.02


In [None]:
# One epoch of one-cycle training, peaking at a learning rate of 1e-2.
learn.fit_one_cycle(1, lr_max=1e-2)


In [None]:
# Display results for up to 2 examples.
learn.show_results(learner=learn, max_n=2)


In [None]:
# Run a single raw-text prediction on the same hand-written comment used in the mid-level section.
comment = """
Those damned affluent white people should only eat their own food, like cod cakes and boiled potatoes. 
No enchiladas for them!
"""
learn.blurr_predict(comment)


In [None]:
# Collect predictions, targets, and losses, then check their shapes.
preds, targs, losses = learn.get_preds(with_loss=True)
preds.shape, targs.shape, losses.shape


## Low-level API

In [None]:
# Load just the first 1,000 training and first 500 validation examples.
# Passing a list of splits: the result is iterated over below, one dataset per requested split.
raw_datasets = datasets.load_dataset("civil_comments", split=["train[:1000]", "validation[:500]"])
raw_datasets


We'll create a `labels` column that includes the OHE labels for each example. The raw values come in the form of probabilities ranging from 0. to 1., so for our purposes here we simply round them to the nearest integer: values above .5 become 1.0 and everything else becomes 0. (Python's `round` sends exactly .5 to 0).

In [None]:
def tokenize_function(example):
    """Tokenize one example's text and attach its `labels` vector.

    The text is tokenized with the module-level `hf_tokenizer` (truncation enabled).
    Each label score is rounded to the nearest integer and stored as a float, in a
    fixed label order. Returns a plain dict: the tokenizer output plus a `labels` entry.
    """
    label_order = (
        "identity_attack",
        "insult",
        "obscene",
        "severe_toxicity",
        "sexual_explicit",
        "threat",
        "toxicity",
    )
    encoded_inputs = hf_tokenizer(example["text"], truncation=True)
    label_vector = [float(round(example[name])) for name in label_order]
    return {**encoded_inputs, "labels": label_vector}


# Tokenize each split one example at a time (`batched=False`, since `tokenize_function` handles a single example).
tokenized_datasets = [ds.map(tokenize_function, batched=False) for ds in raw_datasets]



By passing the aforementioned labels (our `label_names` list) to our `BlurrDataLoader`s via `batch_tfm_kwargs`, we get the friendly label printed when we run `show_batch` or `show_results` instead of the label's index.

In [None]:
label_names = ["identity_attack", "insult", "obscene", "severe_toxicity", "sexual_explicit", "threat", "toxicity"]


def _make_dl(tokenized_ds, **dl_kwargs):
    """Build a `BlurrDataLoader` over `tokenized_ds` with the settings both splits share.

    Anything split-specific (batch size, shuffling) is passed through `dl_kwargs`.
    """
    return BlurrDataLoader(
        tokenized_ds,
        hf_arch=hf_arch,
        hf_config=hf_config,
        hf_tokenizer=hf_tokenizer,
        hf_model=hf_model,
        preproccesing_func=preproc_hf_dataset,
        batch_tfm_kwargs={"labels": label_names},
        **dl_kwargs,
    )


# Training batches are shuffled; validation batches are larger and keep their order.
trn_dl = _make_dl(tokenized_datasets[0], shuffle=True, batch_size=8)
val_dl = _make_dl(tokenized_datasets[1], batch_size=16)

dls = DataLoaders(trn_dl, val_dl)

In [None]:
# Grab a single batch and inspect the `input_ids` and target shapes.
b = dls.one_batch()
b[0]["input_ids"].shape, b[1].shape

In [None]:
# Display up to 2 examples from a batch (`trunc_at=500` limits how much of each text is shown — TODO confirm unit).
dls.show_batch(dataloaders=dls, max_n=2, trunc_at=500)


In [None]:
# Wrap the Hugging Face model; the wrapper is what gets handed to the `Learner`.
model = BaseModelWrapper(hf_model)

# Multi-label accuracy evaluated at a 0.2 threshold.
multi_label_accuracy = partial(accuracy_multi, thresh=0.2)

learn = Learner(
    dls,
    model,
    opt_func=partial(Adam),
    loss_func=BCEWithLogitsLossFlat(),
    metrics=[multi_label_accuracy],
    cbs=[BaseModelCallback],
    splitter=blurr_splitter,
)
# Switch the learner to mixed precision.
learn = learn.to_fp16()

# Keep the loss function's threshold in line with the metric, and start frozen.
learn.loss_func.thresh = 0.2
learn.freeze()

In [None]:
# Train for one epoch at a learning rate of 1e-2.
learn.fit(1, 1e-2)

In [None]:
# Display results for up to 2 examples.
learn.show_results(learner=learn, max_n=2)

## Summary

If your sequence classification model isn't training, make sure you have set the `num_labels` correctly (95% of the time this is the culprit).  And with this example, you can see that Blurr can make both your multiclass and multilabel classification tasks a breeze.

In [None]:
# hide
from nbdev.export import notebook2script

# Run nbdev's notebook-to-script export (called with no arguments here).
notebook2script()
