In [None]:
# default_exp data.core

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by huggingface transformer implementations.

In [None]:
#export
from functools import reduce

import torch, nlp
from transformers import *
from fastai2.text.all import *

from blurr.utils import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Select CUDA device index 1 for this notebook and print which GPU ended up active.
# NOTE(review): the device index is hard-coded; presumably this raises on a machine with a single GPU —
# confirm before running elsewhere.
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_TokenizerTransform(ItemTransform):
    """huggingface friendly tokenization transform.

    Tokenizes and numericalizes raw text in one step by calling `hf_tokenizer` directly.

    Args:
        hf_arch: architecture name; stored on the transform, not read by the methods of this class.
        hf_tokenizer: the huggingface tokenizer object that is called in `encodes` and used in `decodes`.
        max_length, padding, truncation, is_pretokenized: forwarded unchanged to every `hf_tokenizer` call.
        **kwargs: extra keyword arguments forwarded to every `hf_tokenizer` call. When the tokenizer
            exposes an `add_prefix_space` attribute, `add_prefix_space=True` is written into these
            kwargs (replacing any value the caller passed).
    """
    def __init__(self, hf_arch, hf_tokenizer, 
                 max_length=None, padding='max_length', truncation=True, is_pretokenized=False, **kwargs):
        
        # gpt2, roberta, bart (and maybe others) tokenizers require a prefix space
        if (hasattr(hf_tokenizer, 'add_prefix_space')): kwargs['add_prefix_space'] = True
        
        # keep every constructor argument available as an attribute of the same name
        store_attr(self, 'hf_arch, hf_tokenizer, is_pretokenized, max_length, padding, truncation')
        store_attr(self, 'kwargs')
        
    def encodes(self, inp): 
        """Supports passing in one or two input sequences, or a list[str] (the latter is common for token 
        classification tasks where you should also set `is_pretokenized=True`).
        Returns all the tensors for the input sequence(s) in a dictionary.

        A `str`, or any input when `is_pretokenized` is set, is treated as a single sequence; anything
        else is indexed as `inp[0]` / `inp[1]` and must therefore hold the two sequences."""
        inps = [inp, None] if (isinstance(inp, str) or self.is_pretokenized) else inp

        res = self.hf_tokenizer(inps[0], inps[1],
                                max_length=self.max_length,
                                padding=self.padding,
                                truncation=self.truncation,
                                is_pretokenized=self.is_pretokenized,
                                return_tensors='pt', 
                                **self.kwargs)

        # drop dimension 0 of every returned tensor when it has size 1
        # (presumably the batch dimension added by `return_tensors='pt'` for a single example)
        for k in res.keys(): res[k] = res[k].squeeze(0)
        return res
    
    def decodes(self, encoded_inp): 
        """Decodes the first item of `encoded_inp` (this should be the 'input_ids') back into text.

        Pad token ids are removed before decoding and special tokens are skipped; the decoded
        string is returned wrapped in a `TitledStr`."""
        pad_token_id = self.hf_tokenizer.pad_token_id
        # Build a concrete list of plain Python ints: a lazy `filter` object over numpy scalars is not
        # a sequence, and some `decode` implementations may not accept an arbitrary iterator.
        input_ids = [int(el) for el in encoded_inp[0].cpu().numpy() if el != pad_token_id]
        decoded_input = self.hf_tokenizer.decode(input_ids, skip_special_tokens=True)
        return TitledStr(decoded_input)
    

`HF_TokenizerTransform` was inspired by [this article](http://dev.fast.ai/tutorial.transformers).  It handles both the tokenization and numericalization traditionally split apart in the fastai text DataBlock API.  For huggingface tokenizers that require a prefix space, it will be included automatically.

You can pass a string or a list into this Transform, the latter being for tasks that require two input sequences (e.g., question answering tasks require a "context" and a "question" sequence).

In order to make the tokenization/numericalization process more efficient, this transform has been updated to return a `transformers.tokenization_utils_base.BatchEncoding` dictionary with all the required transformer inputs (e.g. input_ids, attention_mask, etc...). Previously, it only returned the raw input_ids for *each* sequence which were then put together to come up with all the required inputs and padding in a `before_batch` transform.

In [None]:
#export
class HF_BaseInput(list):
    """Plain `list` subclass that adds no behavior; it exists so decoded inputs carry their own type."""

A `HF_BaseInput` object is returned from the `decodes` method of `HF_BatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`.  It encapsulates a list with *one* item, the input_ids for the sequence.

In [None]:
#export
class HF_BatchTransform(Transform):
    """Batch-level companion to `HF_TokenizerTransform`.

    Encoding hands the samples back untouched; decoding wraps the 'input_ids' of a dictionary
    input in `hf_input_return_type` and leaves every other kind of input alone.
    """
    def __init__(self, hf_arch, hf_tokenizer, hf_input_return_type=HF_BaseInput, **kwargs):
        # keep every constructor argument (including the extra kwargs) as an attribute of the same name
        store_attr(self, 'hf_arch, hf_tokenizer, hf_input_return_type, kwargs')

    def encodes(self, samples):
        "Identity step: `samples` is returned exactly as received."
        return samples

    def decodes(self, encoded_samples):
        "Wrap `encoded_samples['input_ids']` in a one-item `hf_input_return_type`; pass non-dict inputs through."
        if not isinstance(encoded_samples, dict): return encoded_samples

        input_ids = encoded_samples['input_ids']
        return self.hf_input_return_type([input_ids])

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """`TransformBlock` for huggingface text inputs.

    Uses `hf_tok_tfm` as the type transform and `hf_batch_tfm` as the `before_batch` transform of the
    DataLoaders. When either is `None`, a default `HF_TokenizerTransform` / `HF_BatchTransform` is
    built from the remaining arguments (`tok_kwargs` / `batch_kwargs` are forwarded to the respective
    constructor). The trailing `**kwargs` are accepted but not used by this constructor.
    """
    def __init__(self, hf_arch, hf_tokenizer, 
                 hf_tok_tfm=None, max_length=512, padding='max_length', truncation=True, is_pretokenized=False,
                 hf_batch_tfm=None, hf_input_return_type=HF_BaseInput,   
                 dl_type = SortedDL, tok_kwargs={}, batch_kwargs={}, **kwargs):

        # item-level transform: the caller's, or a default tokenizer transform
        tok_tfm = hf_tok_tfm
        if tok_tfm is None:
            tok_tfm = HF_TokenizerTransform(hf_arch, hf_tokenizer, max_length,
                                            padding, truncation, is_pretokenized, **tok_kwargs)

        # batch-level transform: the caller's, or a default batch transform
        batch_tfm = hf_batch_tfm
        if batch_tfm is None:
            batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, hf_input_return_type, **batch_kwargs)

        super().__init__(type_tfms=tok_tfm, dl_type=dl_type, dls_kwargs=dict(before_batch=batch_tfm))

`HF_TextBlock` has been dramatically simplified from its predecessor.  It handles setting up your `HF_TokenizerTransform` and `HF_BatchTransform` transform regardless of data source (e.g., this will work with files, DataFrames, whatever).

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, dataloaders=None, ctxs=None, max_n=6, **kwargs):
    """`show_batch` implementation for `HF_BaseInput` inputs.

    Creates an empty DataFrame context of at most `max_n` rows when `ctxs` is not given, lets the
    `object` implementation of `show_batch` fill it, displays the result as a DataFrame and returns
    the filled contexts. `dataloaders` is accepted but not used in this function.
    """
    if ctxs is None:
        n_rows = min(len(samples), max_n)
        ctxs = get_empty_df(n_rows)

    # delegate the actual filling of the contexts to the implementation registered for `object`
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)

    display_df(pd.DataFrame(ctxs))
    return ctxs

## Sequence classification

The example below demonstrates how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input).

In [None]:
# Fetch the IMDB sample dataset via `untar_data`; `path` is the folder containing `texts.csv`
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced by any other cell visible in this notebook — confirm it is needed
model_path = Path('models')
# The CSV provides the `label`, `text` and `is_valid` columns used by the DataBlock further down
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four huggingface elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR_MODEL_HELPER`.

In [None]:
# Task selector passed to the helper below
task = HF_TASKS_AUTO.SequenceClassification

# Checkpoint to load; the trailing comment lists alternative checkpoint names
pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# One call returns the four huggingface elements: architecture name, config, tokenizer and model
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

Once you have those elements, you can create your `DataBlock` as simply as shown below.

In [None]:
# single input
# x block: `HF_TextBlock` built from the architecture name and tokenizer; y block: a plain `CategoryBlock`
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), CategoryBlock)

# inputs come from the `text` column, targets from `label`, and the split is driven by the `is_valid` column
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
# Build the DataLoaders from the DataFrame using a batch size of 4
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
# A batch is a 2-tuple: `b[0]` is the dictionary holding 'input_ids' and `b[1]` holds the targets,
# so with bs=4 the displayed lengths are (2, 4, 4)
b = dls.one_batch(); len(b), len(b[0]['input_ids']), len(b[1]) 

(2, 4, 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)

{tuple: [dict, fastai2.torch_core.TensorCategory]}

In [None]:
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"""National Treasure"" (2004) is a thoroughly misguided hodge-podge of plot entanglements that borrow from nearly every cloak and dagger government conspiracy cliché that has ever been written. The film stars Nicholas Cage as Benjamin Franklin Gates (how precious is that, I ask you?); a seemingly normal fellow who, for no other reason than being of a lineage of like-minded misguided fortune hunters, decides to steal a 'national treasure' that has been hidden by the United States founding fathers. After a bit of subtext and background that plays laughably (unintentionally) like Indiana Jones meets The Patriot, the film degenerates into one misguided whimsy after another  attempting to create a 'Stanley Goodspeed' regurgitation of Nicholas Cage and launch the whole convoluted mess forward with a series of high octane, but disconnected misadventures.<br /><br />The relevancy and logic to having George Washington and his motley crew of patriots burying a king's ransom someplace on native soil, and then, going through the meticulous plan of leaving clues scattered throughout U.S. currency art work, is something that director Jon Turteltaub never quite gets around to explaining. Couldn't Washington found better usage for such wealth during the start up of the country? Hence, we are left with a mystery built on top of an enigma that is already on shaky ground by the time Ben appoints himself the new custodian of this untold wealth. Ben's intentions are noble  if confusing. He's set on protecting the treasure. For who and when?your guess is as good as mine.<br /><br />But there are a few problems with Ben's crusade. First up, his friend, Ian Holmes (Sean Bean) decides that he can't wait for Ben to make up his mind about stealing the Declaration of Independence from the National Archives (oh, yeah  brilliant idea!). Presumably, the back of that famous document holds the secret answer to the ultimate fortune. So Ian tries to kill Ben. 
The assassination attempt is, of course, unsuccessful, if overly melodramatic. It also affords Ben the opportunity to pick up, and pick on, the very sultry curator of the archives, Abigail Chase (Diane Kruger). She thinks Ben is clearly a nut  at least at the beginning. But true to action/romance form, Abby's resolve melts quicker than you can say, ""is that the Hope Diamond?""",negative


## Tests

The tests below ensure that the core DataBlock code above works for **all** pretrained sequence classification models available in huggingface.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained classification models you are working with ... and if any of your pretrained sequence classification models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
BLURR_MODEL_HELPER.get_models(task='SequenceClassification')

[transformers.modeling_albert.AlbertForSequenceClassification,
 transformers.modeling_auto.AutoModelForSequenceClassification,
 transformers.modeling_bart.BartForSequenceClassification,
 transformers.modeling_bert.BertForSequenceClassification,
 transformers.modeling_camembert.CamembertForSequenceClassification,
 transformers.modeling_distilbert.DistilBertForSequenceClassification,
 transformers.modeling_electra.ElectraForSequenceClassification,
 transformers.modeling_flaubert.FlaubertForSequenceClassification,
 transformers.modeling_longformer.LongformerForSequenceClassification,
 transformers.modeling_mobilebert.MobileBertForSequenceClassification,
 transformers.modeling_roberta.RobertaForSequenceClassification,
 transformers.modeling_xlm.XLMForSequenceClassification,
 transformers.modeling_xlm_roberta.XLMRobertaForSequenceClassification,
 transformers.modeling_xlnet.XLNetForSequenceClassification]

In [None]:
# Checkpoints exercised by the #slow test loop below: one entry for each sequence classification
# class listed by `get_models` above, except the generic `AutoModelForSequenceClassification`
pretrained_model_names = [
    'albert-base-v1',
    'facebook/bart-base',
    'bert-base-uncased',
    'camembert-base',
    'distilbert-base-uncased',
    'monologg/electra-small-finetuned-imdb',
    'flaubert/flaubert_small_cased', 
    'allenai/longformer-base-4096',
    'google/mobilebert-uncased',
    'roberta-base',
    'xlm-mlm-en-2048',
    'xlm-roberta-base',
    'xlnet-base-cased'
]

In [None]:
# Same data loading as in the sequence classification example earlier in this notebook;
# presumably repeated so the Tests section can be run without that earlier cell — confirm
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced by any other cell visible in this notebook — confirm it is needed
model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
#slow
# For every checkpoint in `pretrained_model_names`: load its huggingface objects, build DataLoaders over
# the IMDB sample with max_length=128 and bs=4, then check the batch structure and the stored tokenizer kwargs.
task = HF_TASKS_AUTO.SequenceClassification

for model_name in pretrained_model_names:
    print(f'=== {model_name} ===\n')
    
    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(model_name, task=task)
    
    print(f'architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n')

    blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, max_length=128), CategoryBlock)

    dblock = DataBlock(blocks=blocks, 
                       get_x=ColReader('text'), 
                       get_y=ColReader('label'), 
                       splitter=ColSplitter(col='is_valid'))
    
    dls = dblock.dataloaders(imdb_df, bs=4) 
    b = dls.one_batch()
    
    print('*** TESTING DataLoaders ***\n')
    try:
        # batch is (inputs, targets); inputs are padded/truncated to exactly `max_length` tokens
        test_eq(len(b), 2)
        test_eq(len(b[0]['input_ids']), 4)
        test_eq(b[0]['input_ids'].shape, torch.Size([4, 128]))
        test_eq(len(b[1]), 4)
        print('--- PASSED: Batch size and input shapes ---')

        if (hasattr(hf_tokenizer, 'add_prefix_space')):
            # the tokenizer transform must have recorded the prefix-space setting in its kwargs
            test_eq(dls.tfms[0].kwargs['add_prefix_space'], True)
            print('--- PASSED: Tokenizer results (prefix space included) ---')
        else:
            print('--- PASSED: Tokenizer results ---')

        dls.show_batch(dataloaders=dls, max_n=2)
    except Exception as err:
        # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt/SystemExit stop this
        # long-running loop, and the failure reason is reported instead of being silently discarded.
        print('--- FAILED: DataLoaders ---\n')
        print(f'{type(err).__name__}: {err}\n')

=== albert-base-v1 ===



Some weights of the model checkpoint at albert-base-v1 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You sho

architecture:	albert
tokenizer:	AlbertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un-bleeping-believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline: what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"when marlene dietrich was labeled box office poison in 1938 one of a handful of actresses so named by the trades papers, it was films like the garden of allah. how a film could be so breathtakingly beautiful to behold and be so insipidly dull is beyond me. also how marlene if she was trying to expand her range and not play a sexpot got stuck with such an old fashioned story is beyond me.br /br /the garden of allah, one of the very first films in modern technicolor was a novel set at the turn of the last century by",negative


=== facebook/bart-base ===



Some weights of the model checkpoint at facebook/bart-base were not used when initializing BartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to u

architecture:	bart
tokenizer:	BartTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results (prefix space included) ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"The sitcom ""The league of Gentlemen"" follows the lives of several bizarre inhabitants of the fictional village ""Royston Vasey"". The different scenes are linked together by their common setting.<br /><br />In the first series, a sketch show, the main plot deals with a new road which is going to be built through Royston Vasey. Consequently, more foreigners visit the small town. But Edward and Tubbs, the owners of a ""local"" shop, which is actually far away from the town, do not like foreigners. Whenever a visitor enters their shop, they kill him. In my opinion some",negative


=== bert-base-uncased ===



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

architecture:	bert
tokenizer:	BertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"rock n'roll is a messy business and dig! demonstrates this masterfully. a project of serious ambition, and perhaps foolhardiness, the filmmaker is able to mend together seven tumultuous years of following around two unwieldy rock groups. with that said, the abundance of quality material ensures the film's ability to captivate the audience. if you've ever been interested in any realm of the music industry, this movie will undoubtedly be an arresting viewing. the music in the film, although it suffers minimally from requisite cutting and pasting, is worth the price of admission alone. the",positive


=== camembert-base ===



Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'cl

architecture:	camembert
tokenizer:	CamembertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown -",negative
1,"I consider myself a huge movie buff. I was sick on the couch and popped in this film. Right from the opening to the end I watched in awe at these great actors, i'd never seen, say great word. The filming was beautiful. It was just what I needed. I hope that this message is heard over any bad comments written by others. The Director has a heart and it beats with his actors throughout. Thanku for",positive


=== distilbert-base-uncased ===



Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

architecture:	distilbert
tokenizer:	DistilBertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"dvd has become the equivalent of the old late night double - bill circuit, the last chance to catch old movies on the verge of being completely forgotten like the border. there were great expectations for this back in 1982 a script co - written by the wild bunch's walon green, jack nicholson in the days when he could still act without semaphore and a great supporting cast ( harvey keitel, warren oates, valerie perrine ), tony richardson directing ( although he was pretty much a spent force by then ) but now it doesn't even turn up on tv. the material certainly offers a rich",negative


=== monologg/electra-small-finetuned-imdb ===



Some weights of the model checkpoint at monologg/electra-small-finetuned-imdb were not used when initializing ElectraForSequenceClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/electra-small-finetuned-imdb and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a

architecture:	electra
tokenizer:	ElectraTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"the only redeeming quality of this film is the actual storyline... otherwise, this movie was terrible. the acting was ridiculously bad, and the set design was cheesy and very tacky. the story was decent, but it was very hard to watch due to all the horrid acting. i wouldn't recommend watching this one... the only redeeming quality of this film was that the actors were somewhat attractive... especially ryan bauer, the man who plays the soap opera star. some of the editing was well done, but there are continuity errors all over the place..",negative


=== flaubert/flaubert_small_cased ===



Some weights of the model checkpoint at flaubert/flaubert_small_cased were not used when initializing FlaubertForSequenceClassification: ['pred_layer.proj.bias', 'pred_layer.proj.weight']
- This IS expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing FlaubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of FlaubertForSequenceClassification were not initialized from the model checkpoint at flaubert/flaubert_small_cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for p

architecture:	flaubert
tokenizer:	FlaubertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn' t even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline : what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"We toss around the term "" superstar "" way too lightly these days, but here' s one guy that truly deserves it. < br / > < br / > I was glued to the set this entire show. The song selection was perfect -- it only contained the songs I actually wanted to hear and cut in with documentary footage during the weaker new songs. I loved that the band was just a five guys on stage in a very minimalist environment. ( With songs of this",positive


=== allenai/longformer-base-4096 ===



Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', '

architecture:	longformer
tokenizer:	LongformerTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results (prefix space included) ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"This is a worthless sequel to a great action movie. Cheap looking, and worst of all, BORING ACTION SCENES! The only decent thing about the movie is the last fight sequence. Only 82 minutes, but it feels like it goes on forever! Even die-hard Van Damme fans(like myself) should avoid this one!",negative


=== google/mobilebert-uncased ===



Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MobileBertForSequence

architecture:	mobilebert
tokenizer:	MobileBertTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un - bleeping - believable! meg ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"a wonderful and gritty war film that focuses on the inner torment of blinded marine al schmid. although it is tough and unpleasant it is in the end heroic - schmid's triumph over disability and depression. the battle scene was superb. but one bone to pick. no matter how many. 50 bullets they fired i never saw any water or dirt being kicked up by the impacts! it hurt the realism, but i can live with it. fine performance by eleanor parker, again, as his girl friend.",positive


=== roberta-base ===



Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

architecture:	roberta
tokenizer:	RobertaTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results (prefix space included) ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,This movie was awful in the worst way: you just didn't care. You didn't care what happened in the plot; you didn't care about the characters. Everyone was devoid of heart. I ended up walking out about an 45 minutes into it because I simply didn't want to subject my mind to it any more. There is far too much sex in the film. Sex can be okay; it can even make the movie (hence Karma Sutra) but the intercourse here was not beautiful or sexy. It was just ugly. Don't see this film.,negative


=== xlm-mlm-en-2048 ===



Some weights of the model checkpoint at xlm-mlm-en-2048 were not used when initializing XLMForSequenceClassification: ['pred_layer.proj.weight', 'pred_layer.proj.bias']
- This IS expected if you are initializing XLMForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMForSequenceClassification were not initialized from the model checkpoint at xlm-mlm-en-2048 and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


architecture:	xlm
tokenizer:	XLMTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"un-bleeping-believable! meg ryan doesn 't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. hard to believe she was the producer on this dog. plus kevin kline : what kind of suicide trip has his career been on? whoosh... banzai!!! finally this was directed by the guy who did big chill? must be a replay of jonestown - hollywood style. wooofff!",negative
1,"i have never posted a review before, but i had to do it for this film! this film is so bad, i found myself trying to justify how bad it is by trying to think of it as kitsch or parody. but it isn 't. it is truly, un-self-consciously bad. this is a serious attempt that flops gloriously. other reviewers have pointed out the film's many flaws, so i 'll try not to repeat these, but i do urge you to see this film. throughout it i was either speechless, literally gasping with disbelief, or rolling on the floor in hysterics. i haven 't",negative


=== xlm-roberta-base ===



Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weig

architecture:	xlm_roberta
tokenizer:	XLMRobertaTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"Okay, 'enjoy' is a pretty relative term, but flexibility is in order when you're dealing with a filmmaker of James Glickenhaus' calibre.<br /><br />McBain is truly one of the most ridiculous, over the top action films I've ever seen, without the nasty edge of The Exterminator. Other reviews have commented on a suspension of disbelief regarding the film's heroic middle aged commandos, but how about making a film in the Philippines that is set in Colombia? All the extras are Filipino. In fact the",negative


=== xlnet-base-cased ===



Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

architecture:	xlnet
tokenizer:	XLNetTokenizer

*** TESTING DataLoaders ***

--- PASSED: Batch size and input shapes ---
--- PASSED: Tokenizer results ---


Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"I do miss the company Vestron, they sure had their finger on the pulse of unique and unusual cinema back in the 1980s. This is very apparent with the astonishing Paperhouse, a film that touches me deeply each and every time I watch it.<br /><br />The idea of a girl manipulating a dream world with her drawings (thusly the dream world manipulating reality), and also connecting with and affecting the life of a boy she's never actually met, is fascinating and never disappoints. Charlotte Burke at first seems quite precocious and yet you warm up to her because",positive


## Example: Multi-label classification

The example below demonstrates how to construct your `DataBlock` for a multi-label classification task.

In [None]:
# creates a dataset with the first 1% of the training set (the `train[:1%]` slice requested below)
raw_data = nlp.load_dataset('civil_comments', split='train[:1%]') 
len(raw_data)

Using custom data configuration default


18049

In [None]:
# wrap the loaded dataset in a pandas DataFrame and preview its first rows
toxic_df = pd.DataFrame(raw_data)
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0


In [None]:
# every column after the first one ('text' in the preview above) is used as a label column
lbl_cols = toxic_df.columns[1:].tolist()
lbl_cols

['toxicity',
 'severe_toxicity',
 'obscene',
 'threat',
 'insult',
 'identity_attack',
 'sexual_explicit']

In [None]:
# round every label column to 0 decimal places so the fractional scores become 0.0 / 1.0 indicators
# NOTE(review): presumably a score of exactly 0.5 rounds to 0.0 (half-to-even) — confirm this is the intended threshold
toxic_df = toxic_df.round(dict.fromkeys(lbl_cols, 0))
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# one output per label column, so the model config is told how many labels to expect
n_labels = len(lbl_cols)
pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"

task = HF_TASKS_AUTO.SequenceClassification

# fetch the architecture name, config, tokenizer and model for the chosen checkpoint and task
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name,
    task=task,
    config_kwargs={'num_labels': n_labels},
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# single input
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), MultiCategoryBlock(encoded=True, vocab=lbl_cols))

dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader(lbl_cols), 
                   splitter=RandomSplitter())

In [None]:
# build the DataLoaders from toxic_df using a batch size of 4
dls = dblock.dataloaders(toxic_df, bs=4)

In [None]:
# pull a single batch and inspect it: number of elements, shape of the input ids, shape of the targets
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([4, 512]), torch.Size([4, 7]))

In [None]:
# display up to 2 examples from a batch
# NOTE(review): in the output below the label column is headed `None` and is empty — confirm show_batch renders the multi-label targets
dls.show_batch(dataloaders=dls, max_n=2)

Unnamed: 0,text,None
0,"We the public should know that an officer will not kill unless absolutely necessary. By keeping the officer's name secret, we remove the obligation to take responsibility or own one's actions. Keeping names in the public eye, we have the security of knowing that officers will be aware that they may be held accountable and thus be motivated to adopt less confrontational tactics in their work so as to avoid public exposure. By allowing the individuals who kill for the government to remain anonymous, we empower the government to kill. Do we really want that? Really?",
1,"As often on new government sanctioned journeys, there are issues to be worked out of our legalized marijuana. The State took care to make sure they got their money (taxes) but have done nothing to secure oversight that any of us and especially medical patients are consuming safe marijuana. Almost ALL dispensaries in Lane County have 'dropped' the organic. They have NO IDEA if pesticides were used. This is unsafe.....and just asking for tragedy. State: ""Where's the oversight and protection""? L2g",


## Cleanup

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-text-generation.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted 02e_modeling-text-generation.ipynb.
Converted index.ipynb.
