In [None]:
# default_exp data.core

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by huggingface transformer implementations.

In [None]:
#export
from functools import reduce

import torch, nlp
from transformers import *
from fastai.text.all import *

from blurr.utils import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# NOTE: the CUDA device index is hard-coded to 1; change it to match the GPU you want to use
# (presumably the machine this was authored on has more than one GPU -- confirm for yours).
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_TokenizerTransform(ItemTransform):
    """huggingface friendly tokenization transform.

    Calls `hf_tokenizer` directly on the raw input so that tokenization and numericalization
    happen in a single step. `max_length`, `padding`, `truncation`, `is_pretokenized` and any
    extra `kwargs` are forwarded to the tokenizer call made in `encodes`.
    """
    def __init__(self, hf_arch, hf_tokenizer, 
                 max_length=None, padding=True, truncation=True, is_pretokenized=False, **kwargs):
        
        # gpt2, roberta, bart (and maybe others) tokenizers require a prefix space
        if (hasattr(hf_tokenizer, 'add_prefix_space')): kwargs['add_prefix_space'] = True
        
        store_attr(self, 'hf_arch, hf_tokenizer, is_pretokenized, max_length, padding, truncation')
        store_attr(self, 'kwargs')
        
    def encodes(self, inp): 
        """Supports passing in one or two input sequences, or a list[str] (the latter is common for token 
        classification tasks where you should also set `is_pretokenized=True`).
        Returns all the tensors for the input sequence(s) in a dictionary."""
        # a lone string (or a pre-tokenized list) is a single sequence; otherwise `inp` is
        # expected to hold the two sequences to encode together
        inps = [inp, None] if (isinstance(inp, str) or self.is_pretokenized) else inp

        res = self.hf_tokenizer(inps[0], inps[1],
                                max_length=self.max_length,
                                padding=self.padding,
                                truncation=self.truncation,
                                is_pretokenized=self.is_pretokenized,
                                return_tensors='pt', 
                                **self.kwargs)

        # `return_tensors='pt'` adds a leading dimension of size 1 for a single example; drop it
        for k in res.keys(): res[k] = res[k].squeeze(0)
        return res
    
    def decodes(self, encoded_inp): 
        """Returns the decoded text for the first item of the list `encoded_inp`; this should be the
        'input_ids'. Padding tokens are removed before decoding and special tokens are skipped."""
        pad_token_id = self.hf_tokenizer.pad_token_id
        # build a concrete list of plain Python ints rather than handing the tokenizer a lazy
        # `filter` iterator of numpy scalars, which not every tokenizer backend accepts
        input_ids = [int(tok_id) for tok_id in encoded_inp[0].cpu().numpy() if tok_id != pad_token_id]
        decoded_input = self.hf_tokenizer.decode(input_ids, skip_special_tokens=True)
        return TitledStr(decoded_input)
    

`HF_TokenizerTransform` was inspired by [this article](http://dev.fast.ai/tutorial.transformers).  It handles both the tokenization and numericalization traditionally split apart in the fastai text DataBlock API.  For huggingface tokenizers that require a prefix space, it will be included automatically.

You can pass a string or a list into this Transform, the latter being for tasks that require two input sequences (e.g., question answering tasks require a "context" and a "question" sequence).

In order to make the tokenization/numericalization process more efficient, this transform has been updated to return a `transformers.tokenization_utils_base.BatchEncoding` dictionary with all the required transformer inputs (e.g. input_ids, attention_mask, etc...). Previously, it only returned the raw input_ids for *each* sequence which were then put together to come up with all the required inputs and padding in a `before_batch` transform.

In [None]:
#export
class HF_BaseInput(list):
    """Plain `list` subclass that serves as a distinct type for typedispatched display functions."""

A `HF_BaseInput` object is returned from the `decodes` method of `HF_BatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`.  It encapsulates a list with *one* item, the input_ids for the sequence.

In [None]:
#export
class HF_BatchTransform(Transform):
    """Batch-level companion to `HF_TokenizerTransform`.

    Encoding hands the samples back untouched. Decoding wraps the `input_ids` of a dictionary
    in `hf_input_return_type` (a one-item list) and returns anything else unchanged.
    """
    def __init__(self, hf_arch, hf_tokenizer, hf_input_return_type=HF_BaseInput, **kwargs):
        store_attr(self, 'hf_arch, hf_tokenizer, hf_input_return_type, kwargs')

    def encodes(self, samples):
        # nothing to do on the way in
        return samples

    def decodes(self, encoded_samples):
        # only dictionaries are wrapped; everything else passes straight through
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples['input_ids']
        return self.hf_input_return_type([input_ids])

In [None]:
#export
def pad_hf_inputs(samples, arch, hf_input_idxs=[0], pad_idx=0, pad_first=False):
    """
    Add this to your batch transforms if you are using dynamic padding with `HF_TokenizerTransform`
    (e.g., padding is set to anything except 'max_length') to ensure all HF tensors are sized to longest input
    in the batch.
    
    Note: This is automatically included as necessary by `HF_TextBlock`

    Args:
        samples: the items of one batch; each item is indexable and holds a dictionary of 1-d tensors
            at every position listed in `hf_input_idxs`. The dictionaries are updated in place.
        arch: architecture name; 'xlnet' pads `token_type_ids` with 3 instead of 0.
        hf_input_idxs: positions within each sample to pad (never modified here). Positions beyond
            the length of a sample are skipped.
        pad_idx: value used to pad `input_ids`.
        pad_first: pad on the left when True, on the right otherwise.

    Returns:
        The same `samples` object, with every tensor at the requested positions sized to the longest
        `input_ids` found at that position in the batch.
    """
    # an empty batch has nothing to pad (indexing `samples[0]` below would raise IndexError)
    if len(samples) == 0: return samples

    for hf_input_idx in hf_input_idxs:
        if (hf_input_idx >= len(samples[0])): continue

        inp_keys = list(samples[0][hf_input_idx].keys())
        max_len = max(len(s[hf_input_idx]["input_ids"]) for s in samples)

        for sample in samples:
            hf_inputs = sample[hf_input_idx]

            for key in inp_keys:
                # the fill value depends on which tensor is being padded
                if (key == 'input_ids'): tok_id = pad_idx
                elif (key == 'special_tokens_mask'): tok_id = 1
                elif (arch == 'xlnet' and key == 'token_type_ids'): tok_id = 3
                else: tok_id = 0

                val = hf_inputs[key]
                # allocate only the missing positions rather than a full `max_len` filler
                n_pad = max(max_len - len(val), 0)
                filler = val.new_full((n_pad,), tok_id)

                if (pad_first):
                    padded = torch.cat((filler, val), dim=0)
                    # keep the trailing `max_len` entries
                    hf_inputs[key] = padded[len(padded) - max_len:]
                else:
                    hf_inputs[key] = torch.cat((val, filler), dim=0)[:max_len]

    return samples

In [None]:
show_doc(pad_hf_inputs)

<h4 id="pad_hf_inputs" class="doc_header"><code>pad_hf_inputs</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>pad_hf_inputs</code>(**`samples`**, **`arch`**, **`hf_input_idxs`**=*`[0]`*, **`pad_idx`**=*`0`*, **`pad_first`**=*`False`*)

Add this to your batch transforms if you are using dynamic padding with [`HF_TokenizerTransform`](/blurr/data-core#HF_TokenizerTransform)
(e.g., padding is set to anything except 'max_length') to ensure all HF tensors are sized to longest input
in the batch.

Note: This is automatically included as necessary by [`HF_TextBlock`](/blurr/data-core#HF_TextBlock)

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """`TransformBlock` that wires up huggingface tokenization and batch assembly.

    Unless supplied, a `HF_TokenizerTransform` is created as the type transform and a
    `HF_BatchTransform` as the batch transform. When `padding` is anything other than
    'max_length', `pad_hf_inputs` runs ahead of the batch transform in `before_batch`;
    otherwise `noop` takes its place. Extra `**kwargs` are accepted but not used here.
    """
    def __init__(self, hf_arch, hf_tokenizer, 
                 hf_tok_tfm=None, max_length=512, padding=True, truncation=True, is_pretokenized=False,
                 hf_batch_tfm=None, hf_input_return_type=HF_BaseInput, hf_input_idxs=[0],
                 dl_type=SortedDL, tok_kwargs={}, batch_kwargs={}, **kwargs):

        # fall back to the default item-level tokenization transform
        if hf_tok_tfm is None:
            hf_tok_tfm = HF_TokenizerTransform(hf_arch, hf_tokenizer,
                                               max_length, padding, truncation, is_pretokenized,
                                               **tok_kwargs)

        # fall back to the default batch-level transform
        if hf_batch_tfm is None:
            hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, hf_input_return_type,
                                             **batch_kwargs)

        # fixed-length padding is already uniform; any other setting pads per batch
        if padding == 'max_length':
            pad_fn = noop
        else:
            pad_fn = partial(pad_hf_inputs,
                             arch=hf_arch,
                             hf_input_idxs=hf_input_idxs,
                             pad_idx=hf_tokenizer.pad_token_id,
                             pad_first=hf_tokenizer.padding_side=='left')

        before_batch = [pad_fn, hf_batch_tfm]
        super().__init__(type_tfms=hf_tok_tfm,
                         dl_type=dl_type,
                         dls_kwargs={ 'before_batch': before_batch })

`HF_TextBlock` has been dramatically simplified from its predecessor.  It handles setting up your `HF_TokenizerTransform` and `HF_BatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever).

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, dataloaders=None, ctxs=None, max_n=6, **kwargs):  
    """Show a batch whose inputs are `HF_BaseInput`: the generic `show_batch[object]` fills `ctxs`,
    which is then displayed as a DataFrame and returned."""
    if ctxs is None:
        # one empty context per sample to show, capped at `max_n`
        n_rows = min(len(samples), max_n)
        ctxs = get_empty_df(n_rows)

    generic_show_batch = show_batch[object]
    ctxs = generic_show_batch(x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)

    results_df = pd.DataFrame(ctxs)
    display_df(results_df)
    return ctxs

## Sequence classification

The example below demonstrates how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input).

In [None]:
# Locate the IMDB sample dataset via fastai's `untar_data` and read its `texts.csv` into a DataFrame
# (columns shown below: label, text, is_valid).
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four huggingface elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR_MODEL_HELPER`.

In [None]:
#hide_output
# Get the four huggingface elements (architecture name, config, tokenizer, model) for the chosen
# pretrained checkpoint and task in a single call.
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
# single input: the text is tokenized by `HF_TextBlock`, the label is handled by `CategoryBlock`
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), CategoryBlock)

# inputs come from the 'text' column, targets from 'label'; 'is_valid' drives the train/valid split
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
# build the DataLoaders from the DataFrame with a batch size of 4
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
# grab one batch, then display: number of elements in the batch, number of inputs,
# shape of the `input_ids` tensor, and number of targets
b = dls.one_batch()
len(b), len(b[0]['input_ids']), b[0]['input_ids'].shape, len(b[1])

(2, 4, torch.Size([4, 273]), 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)

{tuple: [dict, fastai.torch_core.TensorCategory]}

In [None]:
# display at most two decoded examples from a batch
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"The only explanation I can muster as to why this film isn't widely distributed is because it hits too close to home for some. This movie was a genuine happy surprise, the satire is genius. This film turns the lights on in the dark that is organized religion and big media, and the roaches scurry for cover. Rent the DVD and watch it for yourself if you haven't yet, this film succeeds where many have failed (Dogma comes to mind) to poke it's nose under the tent, both by using humor and very clever analogies coupled with telling backdrops and locations. Can't comment in depth without revealing some significant spoilers, there are some surprises in this film which even the seasoned film buff will be caught off guard by.",positive


## Tests

The tests below ensure the core DataBlock code above works for **all** pretrained sequence classification models available in huggingface.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
BLURR_MODEL_HELPER.get_models(task='SequenceClassification')

[transformers.modeling_albert.AlbertForSequenceClassification,
 transformers.modeling_auto.AutoModelForSequenceClassification,
 transformers.modeling_bart.BartForSequenceClassification,
 transformers.modeling_bert.BertForSequenceClassification,
 transformers.modeling_camembert.CamembertForSequenceClassification,
 transformers.modeling_distilbert.DistilBertForSequenceClassification,
 transformers.modeling_electra.ElectraForSequenceClassification,
 transformers.modeling_flaubert.FlaubertForSequenceClassification,
 transformers.modeling_longformer.LongformerForSequenceClassification,
 transformers.modeling_mobilebert.MobileBertForSequenceClassification,
 transformers.modeling_roberta.RobertaForSequenceClassification,
 transformers.modeling_xlm.XLMForSequenceClassification,
 transformers.modeling_xlm_roberta.XLMRobertaForSequenceClassification,
 transformers.modeling_xlnet.XLNetForSequenceClassification]

In [None]:
# One pretrained checkpoint per sequence classification architecture listed above
# (the generic `AutoModelForSequenceClassification` entry has no checkpoint of its own).
pretrained_model_names = [
    'albert-base-v1',
    'facebook/bart-base',
    'bert-base-uncased',
    'camembert-base',
    'distilbert-base-uncased',
    'monologg/electra-small-finetuned-imdb',
    'flaubert/flaubert_small_cased', 
    'allenai/longformer-base-4096',
    'google/mobilebert-uncased',
    'roberta-base',
    'xlm-mlm-en-2048',
    'xlm-roberta-base',
    'xlnet-base-cased'
]

In [None]:
# for model_name in pretrained_model_names:
#     tok = AutoTokenizer.from_pretrained(model_name)
#     print(f'=== {model_name} ===')
#     print(f'=== {tok.padding_side} ===')
#     print(f'=== {tok.pad_token_id} ===')
#     print(tok(['hi', 'hello everyone. its good to be here'], ['yo', 'yo'], padding='max_length', max_length=128))

In [None]:
# NOTE(review): this repeats the data loading done in the "Sequence classification" section;
# presumably so the Tests section can be run on its own -- confirm before removing.
path = untar_data(URLs.IMDB_SAMPLE)

model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
#slow
#hide_output
task = HF_TASKS_AUTO.SequenceClassification
test_results = []

# Smoke-test the DataBlock pipeline against every checkpoint in `pretrained_model_names`.
# Each outcome is recorded in `test_results` as
# (architecture, tokenizer class name, model name, 'PASSED' | 'FAILED', error).
for model_name in pretrained_model_names:
    error=None
    
    print(f'=== {model_name} ===\n')
    
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(model_name, task=task)
    
    print(f'architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n')
    
    try:
        # build the DataLoaders inside the `try` so a model that breaks here is recorded as
        # FAILED instead of aborting the run for all remaining models
        blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, max_length=128), CategoryBlock)

        dblock = DataBlock(blocks=blocks, 
                           get_x=ColReader('text'), 
                           get_y=ColReader('label'), 
                           splitter=ColSplitter(col='is_valid'))
        
        dls = dblock.dataloaders(imdb_df, bs=4) 
        b = dls.one_batch()

        print('*** TESTING DataLoaders ***\n')
        test_eq(len(b), 2)
        test_eq(len(b[0]['input_ids']), 4)
        test_eq(b[0]['input_ids'].shape, torch.Size([4, 128]))
        test_eq(len(b[1]), 4)

        if (hasattr(hf_tokenizer, 'add_prefix_space')):
            test_eq(dls.tfms[0].kwargs['add_prefix_space'], True)
            
        dls.show_batch(dataloaders=dls, max_n=2)
        # record success only once everything (including display) has worked, so a model can
        # never end up with both a PASSED and a FAILED entry
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'PASSED', ''))
        
    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'FAILED', err))

=== albert-base-v1 ===



Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

architecture:	albert
tokenizer:	AlbertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un-bleeping-believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline: what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"a wonderful and gritty war film that focuses on the inner torment of blinded marine al schmid. although it is tough and unpleasant it is in the end heroic - schmid's triumph over disability and depression. the battle scene was superb. but one bone to pick. no matter how many.50 bullets they fired i never saw any water or dirt being kicked up by the impacts! it hurt the realism, but i can live with it. fine performance by eleanor parker, again, as his girl friend.",positive


=== facebook/bart-base ===



Some weights of the model checkpoint at facebook/bart-base were not used when initializing BartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to u

architecture:	bart
tokenizer:	BartTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"I grew up Baptist and I know the story this movie is trying to tell, although I no longer believe the story. I'll give the movie kudos for being as good as the average Lifetime Movie of the Week. Mildly interesting, mediocre acting, a bit slow, the script is predictable, the music is sappy, and it is a bit melodramatic. And all the people left behind have got to be the squeakiest clean non-Christians, ever. Not a single curse word from any of them. But I laughed out loud when the actor playing the man who runs the United Nations pronounced """,negative


=== bert-base-uncased ===



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

architecture:	bert
tokenizer:	BertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"i was gifted with this movie as it had such a great premise, the friendship of three women bespoiled by one falling in love with a younger man. < br / > < br / > intriguing. < br / > < br / > not! i hasten to add. these women are all drawn in extreme caricature, not very supportive of one another and conspiring and contriving to bring each other down. < br / > < br / > anna chancellor and imelda staunton could do no wrong in my book prior to seeing this, but here they are handed a di",negative


=== camembert-base ===



Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

architecture:	camembert
tokenizer:	CamembertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown -",negative
1,"I was gifted with this movie as it had such a great premise, the friendship of three women bespoiled by one falling in love with a younger man.<br /><br />Intriguing.<br /><br />NOT! I hasten to add. These women are all drawn in extreme caricature, not very supportive of one another and conspiring and contriving to bring each other down.<br /><br />Anna Chancell",negative


=== distilbert-base-uncased ===



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

architecture:	distilbert
tokenizer:	DistilBertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"a woman asks for advice on the road to reach a mysterious town, and hears two ghoulish stories from the local weirdo, both zombie related. but perhaps fate has something nasty in store for her too... < br / > < br / > the zombie chronicles is absolutely one of the worst films i have ever seen. in fact i must confess, so bad was it i fast forwarded through most of the garbage. and there was a lot of that, believe me. it runs for just 69 minutes, and there is still tons of filler. you get some skinhead doing a lot of push",negative


=== monologg/electra-small-finetuned-imdb ===



Some weights of the model checkpoint at monologg/electra-small-finetuned-imdb were not used when initializing ElectraForSequenceClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/electra-small-finetuned-imdb and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a

architecture:	electra
tokenizer:	ElectraTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,dig! i would say to anyone even if you don't like metallica to see'some kind of monster'it is a spinal tap type documentary about one of the biggest bands in the world acting like mental kids during a breakdown of sorts. it's fun and fascinating. along the same lines comes dig! a film about'the dandy warhol's'and'the brian jonestown massacre'two portland bands who start off a kind of music scene in there home town only for one of the bands to become huge and one to fall by the wayside into the musical history books. right from the start,positive


=== flaubert/flaubert_small_cased ===



Some weights of the model checkpoint at flaubert/flaubert_small_cased were not used when initializing FlaubertForSequenceClassification: ['pred_layer.proj.bias', 'pred_layer.proj.weight']
- This IS expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_small_cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for p

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn' t even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline : what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"This was the first movie I ever saw Ashley Judd in and the first film of Victor Nunez'that I ever say, and boy am I glad I did. Its'quiet tone, its'relaxed pace, its'realistic depiction of a young woman just starting out in life, its'fine depiction of the struggles she has to go through to make her mark in life, the decisions she makes based on real things, the people she meets - there is nothing wrong with this movie. It is",positive


=== allenai/longformer-base-4096 ===



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

architecture:	longformer
tokenizer:	LongformerTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"I have always liked Spike Lee's movies, but this one was a total waste of 2 1/2 hours. I expected more about Son of Sam and instead got a movie that seemed to have very little to do with the 1977 serial killings. The talking dog was laughable (you know you're in trouble when all the movie patrons burst into laughter inappropriately). The whole movie seemed very disjointed and not very interesting. The sex scenes were totally irrelevent to the plot. I'm not opposed to sex in movies, but it should have some point (unless it's a XXX movie). All in all, we were very",negative


=== google/mobilebert-uncased ===



Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MobileBertForSequence

architecture:	mobilebert
tokenizer:	MobileBertTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,i have seen this movie and the other one. trinity is my name and i find that this one is worse then the first one. i have no idea why they even made another movie it was stupid and pointless sorry to say that i have all of them. i have sat through them number of times and it still drives me to turn it off 5 minutes into the movie. i like terence hill movies and i like bud spencer but this movie just drove me up the wall. if it had a different story line or at least more of a plot and more comedy it might have been funner and worth the 5 dollars i spent buying,negative


=== roberta-base ===



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

architecture:	roberta
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"Saw this as a young naive punk when it was first released. Had me snifflin' like a baby as I left the theatre, trying not to let anyone see. So, when I saw it again now in '07, I knew what to expect & the sobs were ready & primed as their required moment approached. Thankfully this time I was at home.<br /><br />What I hadn't remembered from my youthful viewing- or perhaps hadn't noticed because of it, was the technical brilliance of this movie. The use of flashbacks which tell so much story without resorting to dialogue. The camera work which seemed to",positive


=== xlm-mlm-en-2048 ===



Some weights of the model checkpoint at xlm-mlm-en-2048 were not used when initializing XLMForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMForSequenceClassification were not initialized from the model checkpoint at xlm-mlm-en-2048 and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"un-bleeping-believable! meg ryan doesn 't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"a bit of trivia b / c i can 't figure out how to submit trivia : in the backdrop of this performance, one of the images is < br / > < br / > george serat's "" a sunday afternoon on the island of la grande jatte "" painting ( seen best in chapter 18 ), this painting is the subject of a sonheim musical sunday in the park with george. < br / > < br / > a bit of trivia b / c i can 't figure out how to submit trivia : in the backdrop of this performance, one of the images is < br / >",positive


=== xlm-roberta-base ===



Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"I was fortunate enough to meet George Pal (and still have my DS:TMOB poster autographed by him) at a convention shortly after the release, and asked him why he chose to do the film ""camp"". Before he could answer, two studio flacks intercepted and lectured me on how the studio ""knew best"" and how ""no one will take such a film seriously"". I had been reading the Bantam reprints for a couple of years thanks to a friend (ComiCon attendees of the 1970s will recall Blackhawk and his band? I was in a",negative


=== xlnet-base-cased ===



Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

architecture:	xlnet
tokenizer:	XLNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"Somebody needs to send this Uli Lommel guy back to MOVIE SCHOOL. Who ever told him HE knew HOW to make a movie? Can just ANYBODY make movies these days? In the past, it always REQUIRED TALENT before someone could make a movie. After watching this lame BTK movie and the others he's made, it seems blatantly obvious that the poor guy has about as much business making movies as I DO. Actually I think even I could make better movies than Uli LAME-ALL. This movie has absolutely NOTHING to do with",negative


In [None]:
#slow
#hide_input
# Tabulate the outcomes held in `test_results` (populated earlier in the notebook):
# one row per tested model with its architecture, tokenizer, model name, result, and any error.
test_results_df = pd.DataFrame(test_results, columns=['arch', 'tokenizer', 'model_name', 'result', 'error'])
display_df(test_results_df)

Unnamed: 0,arch,tokenizer,model_name,result,error
0,albert,AlbertTokenizer,albert-base-v1,PASSED,
1,bart,BartTokenizer,facebook/bart-base,PASSED,
2,bert,BertTokenizer,bert-base-uncased,PASSED,
3,camembert,CamembertTokenizer,camembert-base,PASSED,
4,distilbert,DistilBertTokenizer,distilbert-base-uncased,PASSED,
5,electra,ElectraTokenizer,monologg/electra-small-finetuned-imdb,PASSED,
6,flaubert,FlaubertTokenizer,flaubert/flaubert_small_cased,PASSED,
7,longformer,LongformerTokenizer,allenai/longformer-base-4096,PASSED,
8,mobilebert,MobileBertTokenizer,google/mobilebert-uncased,PASSED,
9,roberta,RobertaTokenizer,roberta-base,PASSED,


## Example: Multi-label classification

The following demonstrates how to construct your `DataBlock` for a multi-label classification task.

In [None]:
# creates a dataset with the first 1% of the training set
raw_data = nlp.load_dataset('civil_comments', split='train[:1%]') 
len(raw_data)

Using custom data configuration default


18049

In [None]:
# Materialize the dataset as a DataFrame, with one column per dataset feature name.
toxic_df = pd.DataFrame(raw_data, columns=list(raw_data.features.keys()))
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0


In [None]:
# every column after the first one (`text`) is treated as a label column
lbl_cols = list(toxic_df.columns[1:])
lbl_cols

['toxicity',
 'severe_toxicity',
 'obscene',
 'threat',
 'insult',
 'identity_attack',
 'sexual_explicit']

In [None]:
# round each label column to 0 decimals (e.g. 0.893617 -> 1.0, 0.021277 -> 0.0)
toxic_df = toxic_df.round(dict.fromkeys(lbl_cols, 0))
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
#hide_output
task = HF_TASKS_AUTO.SequenceClassification

# alternatives noted by the author: "distilbert-base-uncased", "bert-base-uncased"
pretrained_model_name = "roberta-base"
# the model config gets one label per label column
n_labels = len(lbl_cols)

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name,
    task=task,
    config_kwargs={'num_labels': n_labels},
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# single input: x is read from the `text` column, y from the label columns
# (`encoded=True` is passed because the targets are supplied as per-label 0/1 values, not label names)
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
    MultiCategoryBlock(encoded=True, vocab=lbl_cols),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('text'),
    get_y=ColReader(lbl_cols),
    splitter=RandomSplitter(),
)

In [None]:
# build the DataLoaders from the DataFrame using a batch size of 4
dls = dblock.dataloaders(toxic_df, bs=4)

In [None]:
# pull a single batch and report: number of batch elements, shape of the `input_ids` tensor, shape of the targets
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([4, 170]), torch.Size([4, 7]))

In [None]:
# preview up to two examples from a batch (text alongside its labels)
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"looking at it...trying to understand how this profound idiot is dominating this election...bottom line...it's because he pisses off the PC left, close enough.",identity_attack;sexual_explicit
1,"It takes volunteers and some local 4 to put on events like these... do you realize how few volunteers there are who take on most of the community projects in Anchorage? Everyone complaining might well think about volunteering themselves! A few more volunteers in any local endeavor always makes a big difference in enjoyment, burdens and successes.",


## Cleanup

In [None]:
#hide
# run nbdev's notebook2script; its output lists each notebook it converted
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-summarization.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted 02e_modeling-summarization.ipynb.
Converted index.ipynb.
