In [None]:
# default_exp data.core

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.core

> This module contains the core bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by huggingface transformer implementations.

In [None]:
#export
from functools import reduce

import torch, nlp
from transformers import *
from fastai2.text.all import *

from blurr.utils import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# NOTE(review): the GPU index is hard-coded to 1; on a machine with fewer than two CUDA
# devices this call is expected to fail -- adjust the index to match your hardware.
torch.cuda.set_device(1)
# Report the index and name of the device that is now current.
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_TokenizerTransform(ItemTransform):
    """huggingface friendly tokenization transform.

    Tokenizes and numericalizes raw text in a single step by calling `hf_tokenizer` directly.

    Args:
        hf_arch: stored on the instance as-is (not otherwise used by this class).
        hf_tokenizer: the huggingface tokenizer; it is called to encode, and its `decode` method and
            `pad_token_id` attribute are used to decode.
        max_length, padding, truncation, is_pretokenized: forwarded unchanged to `hf_tokenizer(...)`.
        tok_kwargs: optional dict of extra keyword arguments forwarded to `hf_tokenizer(...)`. The dict
            is copied, so neither the caller's dict nor a shared default is ever mutated.
    """
    def __init__(self, hf_arch, hf_tokenizer, max_length=None, padding='max_length', truncation=True,
                 is_pretokenized=False, tok_kwargs=None):

        # Work on a private copy: a mutable default (or the caller's dict) must not pick up
        # 'add_prefix_space' as a side effect and leak it into other instances/tokenizers.
        tok_kwargs = dict(tok_kwargs) if tok_kwargs else {}

        # gpt2, roberta, bart (and maybe others) tokenizers require a prefix space
        if (hasattr(hf_tokenizer, 'add_prefix_space')): tok_kwargs['add_prefix_space'] = True

        # `store_attr` picks the values up by name from this frame, so it sees the copied `tok_kwargs`
        store_attr(self, 'hf_arch, hf_tokenizer, is_pretokenized, max_length, padding, truncation, tok_kwargs')

    def encodes(self, inp):
        """Tokenize `inp` and return the tokenizer's output with the leading dimension squeezed away.

        A `str`, or any input when `is_pretokenized` is True, is treated as a single sequence. Anything
        else is treated as a sequence of texts: the first element is the primary text and the second
        (when present) is its pair. The result already holds the numericalized token ids, so there is no
        need to run it through a Numericalization transform."""
        if isinstance(inp, str) or self.is_pretokenized:
            text, text_pair = inp, None
        else:
            # tolerate a one-element sequence instead of raising IndexError on the missing pair
            text = inp[0]
            text_pair = inp[1] if len(inp) > 1 else None

        res = self.hf_tokenizer(text, text_pair,
                                max_length=self.max_length,
                                padding=self.padding,
                                truncation=self.truncation,
                                is_pretokenized=self.is_pretokenized,
                                return_tensors='pt',
                                **self.tok_kwargs)

        # drop dimension 0 when it has size 1 (presumably the batch dimension the tokenizer adds for a
        # single example), so each value describes just this one item
        for k in res.keys(): res[k] = res[k].squeeze(0)
        return res

    def decodes(self, encoded_inp):
        """Convert the token ids found in `encoded_inp[0]` back into text.

        Pad tokens are removed, special tokens are skipped by the tokenizer, and the decoded text is
        returned wrapped in a `TitledStr`."""
        pad_token_id = self.hf_tokenizer.pad_token_id

        # Materialize a plain list of Python ints; a lazy `filter` object over numpy scalars may not be
        # accepted by every tokenizer's `decode` implementation.
        input_ids = [tok_id for tok_id in encoded_inp[0].cpu().numpy().tolist() if tok_id != pad_token_id]

        decoded_input = self.hf_tokenizer.decode(input_ids, skip_special_tokens=True)
        return TitledStr(decoded_input)
    

`HF_TokenizerTransform` was inspired by [this article](http://dev.fast.ai/tutorial.transformers).  It handles both the tokenization and numericalization traditionally split apart in the fastai text DataBlock API.  For huggingface tokenizers that require a prefix space, it will be included automatically.

You can pass a string or a list into this Transform, the latter being for tasks that require two input sequences (e.g., question answering tasks require both a "context" and a "question" sequence).

In order to make the tokenization/numericalization process more efficient, this transform has been updated to return a `transformers.tokenization_utils_base.BatchEncoding` dictionary with all the required transformer inputs (e.g. input_ids, attention_mask, etc...). Previously, it only returned the raw input_ids for *each* sequence which were then put together to come up with all the required inputs and padding in a `before_batch` transform.

In [None]:
#export
class HF_BaseInput(list):
    """Marker `list` subclass; it adds no behavior and exists so @typedispatched functions can target it."""

A `HF_BaseInput` object is returned from the `decodes` method of `HF_BatchTransform` as a means to customize @typedispatched functions like `DataLoaders.show_batch` and `Learner.show_results`.  It encapsulates a list with *one* item, the input_ids for the sequence.

In [None]:
#export
class HF_BatchTransform(Transform):
    """Batch-level transform: passes samples through untouched when encoding, and when decoding wraps
    the `input_ids` of a dict of model inputs in `hf_input_return_type`."""
    def __init__(self, hf_arch, hf_tokenizer, hf_input_return_type=HF_BaseInput, **kwargs):
        # every argument (including the extra `kwargs` dict) is kept as an attribute of the same name
        store_attr(self, 'hf_arch, hf_tokenizer, hf_input_return_type, kwargs')

    def encodes(self, samples):
        """Return `samples` unchanged."""
        return samples

    def decodes(self, encoded_samples):
        """Wrap `encoded_samples['input_ids']` in a one-item `hf_input_return_type`; anything that is
        not a dict is returned as-is."""
        if not isinstance(encoded_samples, dict):
            return encoded_samples

        input_ids = encoded_samples['input_ids']
        return self.hf_input_return_type([input_ids])

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """A `TransformBlock` that wires a huggingface tokenizer into the DataBlock API.

    Args:
        hf_arch, hf_tokenizer: passed to the default tokenizer and batch transforms.
        hf_tok_tfm: a ready-made tokenization transform; when given, `max_length`, `padding`,
            `truncation`, `is_pretokenized` and `tok_kwargs` are ignored.
        max_length, padding, truncation, is_pretokenized: used to build the default
            `HF_TokenizerTransform`.
        hf_batch_tfm: a ready-made batch transform; when given, `hf_input_return_type` is ignored.
        hf_input_return_type: type the default `HF_BatchTransform` wraps decoded inputs in.
        dl_type: DataLoader type handed to `TransformBlock`.
        **tok_kwargs: extra keyword arguments for the tokenizer call. A single `tok_kwargs={...}` dict
            is also accepted and merged in (explicit keywords win on conflicts).
    """
    def __init__(self, hf_arch, hf_tokenizer,
                 hf_tok_tfm=None, max_length=512, padding='max_length', truncation=True, is_pretokenized=False,
                 hf_batch_tfm=None, hf_input_return_type=HF_BaseInput,
                 dl_type = SortedDL, **tok_kwargs):

        # Callers may have passed `tok_kwargs={...}` as a keyword (the only form that used to work);
        # flatten it so both calling styles end up as one plain dict of tokenizer options.
        nested_tok_kwargs = tok_kwargs.pop('tok_kwargs', None) or {}
        tok_kwargs = {**nested_tok_kwargs, **tok_kwargs}

        if (hf_tok_tfm is None):
            # `HF_TokenizerTransform.__init__` takes the options as one `tok_kwargs` dict, not as
            # arbitrary keywords, so they must be passed by name rather than splatted.
            hf_tok_tfm = HF_TokenizerTransform(hf_arch, hf_tokenizer, max_length,
                                               padding, truncation, is_pretokenized, tok_kwargs=tok_kwargs)

        if (hf_batch_tfm is None):
            hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, hf_input_return_type)

        # tokenization runs per item (`type_tfms`); the batch transform is hooked in as `before_batch`
        super().__init__(type_tfms=hf_tok_tfm, dl_type=dl_type, dls_kwargs={ 'before_batch': hf_batch_tfm })

`HF_TextBlock` has been dramatically simplified from its predecessor.  It handles setting up your `HF_TokenizerTransform` and `HF_BatchTransform` transforms regardless of data source (e.g., this will work with files, DataFrames, whatever).

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, hf_tokenizer=None, ctxs=None, max_n=6, **kwargs):
    """Show up to `max_n` samples of a batch whose inputs are `HF_BaseInput`, rendered as a DataFrame.

    `hf_tokenizer` is accepted but not used by this implementation. When `ctxs` is not supplied, an
    empty DataFrame context with `min(len(samples), max_n)` rows is created. Returns the filled `ctxs`."""
    if ctxs is None:
        n_rows = min(len(samples), max_n)
        ctxs = get_empty_df(n_rows)

    # hand the actual cell filling over to the implementation registered for `object`
    generic_show_batch = show_batch[object]
    ctxs = generic_show_batch(x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)

    display_df(pd.DataFrame(ctxs))
    return ctxs

## Example usage - Multi-class classification

Below demonstrates how to construct your `DataBlock` for a sequence classification task (e.g., a model that requires a single text input).

In [None]:
# Fetch the IMDB sample dataset through fastai's `untar_data` and keep the path it returns.
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced again in the visible part of this notebook.
model_path = Path('models')
# Load the reviews; the frame has 'label', 'text' and 'is_valid' columns (see the preview below).
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four huggingface elements we need (i.e., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR_MODEL_HELPER`.

In [None]:
# The task is passed to the helper below together with the checkpoint name.
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# A single helper call returns the architecture name, config, tokenizer and model.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, task=task)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

Once you have those elements, creating your `DataBlock` is as simple as the code below.

In [None]:
# single input
# x block: `HF_TextBlock` built from the huggingface tokenizer; y block: a single category.
blocks = (HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), CategoryBlock)

# Inputs come from the 'text' column, targets from 'label', and the train/valid split
# is driven by the 'is_valid' column.
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
b = dls.one_batch(); len(b), len(b[0]['input_ids']), len(b[1]) 

(2, 4, 4)

Let's take a look at the actual types represented by our batch

In [None]:
explode_types(b)

{tuple: [dict, fastai2.torch_core.TensorCategory]}

In [None]:
dls.show_batch(max_n=2)

Unnamed: 0,text,category
0,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",negative
1,"Why should you watch this? There are certainly no reasons why you shouldn't watch it! Superbly and amusingly directed by Albert and David Maysles, Grey Gardens was originally intended to be a film on the gentrification of East Hampton, but it turned out to the brothers that it would be more interesting to produce a study on the eccentric life of the two Edith Bouvier Beales, the aunt and cousin of Jacqueline Kennedy Onassis. Their life was certainly an amusing one (Edith spent most of her day in bed singing operas, Edie performing pirouettes and majorette dances with their many cats, one was named Ted Z. Kennedy) The film is interesting because it is both funny and sad - Edith died shortly after the film was released (in February 1977) aged 82 after experiencing some of the fame that she and Edie received after the film (she danced and sang in a nightclub Edie Beale Jr was born in 1925 and is still living in Miami Beach.This film is both engaging and spellbounding.",positive


## Example usage - Multi-label classification

Below demonstrates how to construct your `DataBlock` for a multi-label classification task.

In [None]:
# creates a dataset with the first 1% of the training set (see the `train[:1%]` split below)
raw_data = nlp.load_dataset('civil_comments', split='train[:1%]') 
len(raw_data)

Using custom data configuration default


18049

In [None]:
toxic_df = pd.DataFrame(raw_data)
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,0.893617,0.021277,0.0,0.0,0.87234,0.021277,0.0


In [None]:
# Every column after the first one ('text' in the preview above) is treated as a label column.
lbl_cols = list(toxic_df.columns[1:]); lbl_cols

['toxicity',
 'severe_toxicity',
 'obscene',
 'threat',
 'insult',
 'identity_attack',
 'sexual_explicit']

In [None]:
# Round every label column to 0 decimals; the scores previewed above lie in [0, 1],
# so they turn into 0.0 / 1.0 indicator values.
toxic_df = toxic_df.round({col: 0 for col in lbl_cols})
toxic_df.head()

Unnamed: 0,text,toxicity,severe_toxicity,obscene,threat,insult,identity_attack,sexual_explicit
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,haha you guys are a bunch of losers.,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
task = HF_TASKS_AUTO.SequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
# One label per label column.
n_labels = len(lbl_cols)

# Same helper as in the multi-class example, with `num_labels` supplied through `config_kwargs`.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, 
                                                                               task=task, 
                                                                               config_kwargs={'num_labels': n_labels})

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# single input
blocks = (
    HF_TextBlock(hf_arch=hf_arch, hf_tokenizer=hf_tokenizer), 
    # NOTE(review): `encoded=True` presumably tells fastai the targets are already 0/1 encoded,
    # with `vocab` naming each position -- confirm against the fastai docs.
    MultiCategoryBlock(encoded=True, vocab=lbl_cols)
)

# Inputs come from 'text'; targets are read from all label columns at once. The split is random.
dblock = DataBlock(blocks=blocks, 
                   get_x=ColReader('text'), 
                   get_y=ColReader(lbl_cols), 
                   splitter=RandomSplitter())

In [None]:
dls = dblock.dataloaders(toxic_df, bs=4)

In [None]:
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

(2, torch.Size([4, 512]), torch.Size([4, 7]))

In [None]:
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,None
0,"Ryan's family has already posted on Facebook saying that ""“the arrests stop here.” \n\nIt thus does not sound like he is going to allow himself to be arrested. Law enforcement will no doubt be forced to gun him down, just like they were forced to shoot Finicum. This is just so crazy that his family thinks that he can somehow avoid arrest. They are for all practical reasons sentencing him to death, by taking such an extreme stand.\n\nThe government really needs to crack down, and make an example out of these people. It looks like he is facing a mandatory 10 years in Federal prison, based on these charges.",
1,"Interesting take on religion Jeff, thanks. I remember all those ""devout, born-agains"" just loving Dubya because he was--like them--a ""good Christian man."" Who laughed as he sent a born-again woman to the gas chamber as governor of Texas (setting a record for executions only broken by his numerically-challenged successor, Rick Perry). Now we have those same ""born-agains"" lining up behind a thrice-wed bloodthirsty narcissistic buffoon like Trump (who, like Reagan, hasn't set foot in a church in decades), and a lying master of dirty tricks like Cruz, who fled the Catholic church, in Texas, for one with more voters. Most Europeans have gotten past tying religion and politics. If only we could. God may not be dead, but she certainly has better things to do than hang out in the voting booth. (Or kitchen table, as we fill out our mail-in ballots.)",


## Cleanup

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-text-generation.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02_training-summarization.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted 02e_modeling-text-generation.ipynb.
Converted index.ipynb.
