In [None]:
# default_exp data.seq2seq.core

In [None]:
#all_slow

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.seq2seq.core

> This module contains the core seq2seq (e.g., language modeling, summarization, translation) bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by Hugging Face transformer implementations.

In [None]:
#export
from functools import reduce

from fastcore.all import *
from fastai.data.block import DataBlock, CategoryBlock, ColReader, ColSplitter
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import (
    AutoModelForSeq2SeqLM, logging,
    PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel
)

from blurr.utils import BLURR
from blurr.data.core import (
    HF_TextBlock, HF_BaseInput, HF_BeforeBatchTransform, HF_AfterBatchTransform, first_blurr_tfm
)

logging.set_verbosity_error()

In [None]:
#hide_input
import pdb

from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbverbose.showdoc import show_doc
from transformers import BartForConditionalGeneration

from blurr.utils import print_versions
from blurr.data.core import HF_TextBlock

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions('torch fastai transformers')

What we're running with at the time this documentation was generated:
torch: 1.7.1
fastai: 2.5.2
transformers: 4.9.2


In [None]:
#hide
#cuda
# NOTE(review): the GPU index (1) is hard-coded; presumably this cell fails on hosts with fewer
# than two CUDA devices -- confirm and adjust the index before running elsewhere.
torch.cuda.set_device(1)
# Report which device is now current so the generated docs record the hardware used
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


In [None]:
# Fetch the architecture name, config, tokenizer, and model for the `facebook/bart-large-cnn` checkpoint,
# asking for the model to be built with `BartForConditionalGeneration`
pretrained_model_name = "facebook/bart-large-cnn"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, 
                                                                  model_cls=BartForConditionalGeneration)

# Display the architecture name along with the concrete types of the other three objects
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)

('bart',
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

## Base tokenization, batch transform, and DataBlock methods

Seq2Seq tasks are essentially conditional generation tasks, and this applies to specific derived tasks such as summarization and translation. Given this, we can use the *same* HF_Seq2Seq transforms, `HF_Seq2SeqInput`, and `HF_Seq2SeqBlock` for all of these tasks.

In [None]:
#export
class HF_Seq2SeqInput(HF_BaseInput):
    "Marker subclass of `HF_BaseInput` (adds no behavior) used to type seq2seq inputs, e.g. for the typedispatched `show_batch`"

We create a subclass of `HF_BeforeBatchTransform` for summarization tasks to add `decoder_input_ids` and `labels` to our inputs during training, which will in turn allow the Hugging Face model to calculate the loss for us.  See [here](https://huggingface.co/transformers/glossary.html#labels) and [here](https://huggingface.co/transformers/glossary.html#decoder-input-ids) for more information on these additional inputs used in summarization, translation, and conversational training tasks. How they should look for a particular architecture can be found in the docs for that model's `forward` function (see [here](https://huggingface.co/transformers/model_doc/bart.html#transformers.BartModel.forward) for BART, for example).

Note also that `decoder_input_ids` are simply the `labels` (i.e., the target ids) shifted to the right by one, since the task is to predict the next token based on the current (and all previous) `decoder_input_ids`.

And lastly, we also update our targets to just be the `input_ids` of our target sequence so that fastai's `Learner.show_results` works (again, almost all the fastai bits require returning a single tensor to work).

In [None]:
#export
def default_text_gen_kwargs(hf_config, hf_model, task=None):
    """Build the default keyword arguments for `hf_model.generate` from `hf_config`.

    Every parameter name in the signature of `hf_model.generate` that is also a key of
    `hf_config.to_dict()` is included with the config's value. If `task` is given and the
    config defines `task_specific_params[task]`, those entries are merged on top (overriding
    the config-level values, and added as-is even if they are not `generate` parameters).

    Args:
        hf_config: A configuration object exposing `to_dict()` and, optionally, `task_specific_params`.
        hf_model: A model object exposing a `generate` method.
        task: Optional key into `hf_config.task_specific_params` (e.g., 'summarization').

    Returns:
        A new `dict` of text generation keyword arguments.
    """
    hf_config_dict = hf_config.to_dict()

    # only keep the config values that `generate` actually names as parameters (signature order is preserved)
    generate_func_args = inspect.signature(hf_model.generate).parameters
    text_gen_kwargs = {k: hf_config_dict[k] for k in generate_func_args if k in hf_config_dict}

    if (task is not None):
        # not all configs even have a task_specific_params property (and it may be `None` when they do)
        task_specific_params = getattr(hf_config, 'task_specific_params', None) or {}
        try:
            text_gen_kwargs = { **text_gen_kwargs, **task_specific_params[task] }
        except (KeyError, TypeError):
            # best effort: nothing usable is defined for this task, so keep the config-level defaults.
            # Only these "no params" cases are ignored; anything else (e.g. KeyboardInterrupt) propagates.
            pass

    return text_gen_kwargs

In [None]:
default_text_gen_kwargs(hf_config, hf_model)

{'max_length': 142,
 'min_length': 56,
 'do_sample': False,
 'early_stopping': True,
 'num_beams': 4,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'repetition_penalty': 1.0,
 'bad_words_ids': None,
 'bos_token_id': 0,
 'pad_token_id': 1,
 'eos_token_id': 2,
 'length_penalty': 2.0,
 'no_repeat_ngram_size': 3,
 'encoder_no_repeat_ngram_size': 0,
 'num_return_sequences': 1,
 'decoder_start_token_id': 2,
 'use_cache': True,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'output_attentions': False,
 'output_hidden_states': False,
 'output_scores': False,
 'return_dict_in_generate': False,
 'forced_bos_token_id': 0,
 'forced_eos_token_id': 2,
 'remove_invalid_values': False}

In [None]:
#hide
# Scratch demo of a "shift right": left-pad every row with a single column of 1s, then drop the last
# column so the result keeps its original (3, 3) shape
t = torch.randn((3,3));

F.pad(t, pad=(1,0), value=1)[:,:-1]

tensor([[ 1.0000,  0.5688, -0.4156],
        [ 1.0000, -0.1378, -0.5449],
        [ 1.0000,  0.1534,  0.0900]])

In [None]:
#export
class HF_Seq2SeqBeforeBatchTransform(HF_BeforeBatchTransform):
    """Before-batch transform that tokenizes the source (and, when present, target) texts of seq2seq samples.

    For every sample, `encodes` returns a tuple of: a dict of the tokenizer outputs for that sample
    (including `labels` when targets exist), the target ids (omitted when there are no targets), and any
    remaining elements of the original sample (`sample[2:]`).
    """

    def __init__(
        self, 
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch:str,   
        # A specific configuration instance you want to use
        hf_config:PretrainedConfig,   
        # A Hugging Face tokenizer
        hf_tokenizer:PreTrainedTokenizerBase,  
        # A Hugging Face model
        hf_model:PreTrainedModel,      
        # The token ID that should be ignored when calculating the loss
        ignore_token_id:int=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation of the input sequence. It can be an integer or None, 
        # in which case it will default to the maximum length the model can accept. If the model has no 
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length:int=None, 
        # To control the length of the padding/truncation of the target sequence. It can be an integer or None, 
        # in which case it will default to the maximum length the model can accept. If the model has no 
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_target_length:int=None, 
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to 
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding:Union[bool, str]=True, 
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation:Union[bool, str]=True, 
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words:bool=False, 
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs={}, 
        # Any keyword arguments to pass to the `hf_model.generate` method
        text_gen_kwargs={}, 
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):     
        # NOTE(review): `is_split_into_words` is accepted here, but `False` is always forwarded to the parent
        # and `encodes` below never passes it to the tokenizer -- confirm whether pre-tokenized inputs are
        # meant to be supported for seq2seq before wiring it through.
        super().__init__(hf_arch, hf_config, hf_tokenizer, hf_model,
                         max_length=max_length, padding=padding, truncation=truncation, is_split_into_words=False, 
                         tok_kwargs=tok_kwargs.copy(), **kwargs)
        
        store_attr(self=self, names='text_gen_kwargs, max_target_length, ignore_token_id')

        # keep a private copy (as is done for `tok_kwargs`) so that instances never share the mutable `{}` 
        # default, or a caller's dict, with one another
        self.text_gen_kwargs = dict(text_gen_kwargs) if (text_gen_kwargs is not None) else {}
    
    def encodes(self, samples): 
        samples = L(samples)
        
        # the first element of each sample is the source text; the second (if there is one) is the target text
        src_texts = samples.itemgot(0).items
        tgt_texts = samples.itemgot(1).items if (len(samples[0]) > 1) else None
        
        # tokenize the whole batch of source texts at once
        tok_d = self.hf_tokenizer(src_texts, max_length=self.max_length, padding=self.padding, 
                                  truncation=self.truncation, return_tensors='pt', **self.tok_kwargs)

        if (tgt_texts):
            # tokenize the targets with the tokenizer in "target" mode, using the target specific max length
            with self.hf_tokenizer.as_target_tokenizer():
                tok_d_targs = self.hf_tokenizer(tgt_texts, max_length=self.max_target_length, padding=self.padding, 
                                                truncation=self.truncation, return_tensors='pt', **self.tok_kwargs)

                tok_d['labels'] = tok_d_targs['input_ids']
        
        # add in target ids for us to use if fastai is calculating the loss (one empty placeholder per sample
        # when there are no targets, which `tuplify` below turns into "no target element")
        targ_ids = [[] for _ in range(len(samples))]
        if ('labels' in tok_d):
            # in place: any label equal to `ignore_token_id` is replaced with the tokenizer's pad token id.
            # NOTE(review): if the intent is for padding to be ignored by the loss, the two operands look 
            # swapped -- confirm against the modeling code (which decodes these targets) before changing.
            tok_d['labels'].masked_fill_(tok_d['labels'] == self.ignore_token_id, self.hf_tokenizer.pad_token_id)
            targ_ids = tok_d['labels'].clone()

        # update samples with tokenized inputs (e.g. input_ids, attention_mask, etc...)
        d_keys = tok_d.keys()
        updated_samples = [ ({k: tok_d[k][idx] for k in d_keys}, *tuplify(targ_ids[idx]), *sample[2:]) 
                            for idx, sample in enumerate(samples) ]
        
        return updated_samples

We include a new AFTER batch `Transform` and `TransformBlock` specific to text-2-text tasks.

In [None]:
#export
class HF_Seq2SeqAfterBatchTransform(HF_AfterBatchTransform):
    "After-batch transform whose `decodes` wraps the input ids in `self.input_return_type`"

    def decodes(self, encoded_samples):
        # a dict of tokenizer outputs carries the ids under 'input_ids'; anything else is used as the ids directly
        if isinstance(encoded_samples, dict):
            token_ids = encoded_samples['input_ids']
        else:
            token_ids = encoded_samples

        return self.input_return_type(token_ids, hf_tokenizer=self.hf_tokenizer)
    
    
class HF_Seq2SeqBlock(HF_TextBlock):
    """An `HF_TextBlock` for seq2seq tasks.

    Unless you supply your own, it builds an `HF_Seq2SeqBeforeBatchTransform` and an 
    `HF_Seq2SeqAfterBatchTransform` from the Hugging Face objects passed in. The text generation kwargs 
    in effect (yours, or the defaults from `default_text_gen_kwargs`) are stored on `self.text_gen_kwargs` 
    and handed to the before-batch transform this block creates.
    """
    
    def __init__(
        self, 
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_arch:str=None,          
        # A Hugging Face configuration object (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_config:PretrainedConfig=None,    
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_tokenizer:PreTrainedTokenizerBase=None,  
        # A Hugging Face model (not required if passing in an
        # instance of `HF_BeforeBatchTransform` to `before_batch_tfm`)
        hf_model:PreTrainedModel=None,                     
        # The before batch transform you want to use to tokenize your raw data on the fly 
        # (defaults to an instance of `HF_Seq2SeqBeforeBatchTransform` created using the Hugging Face objects defined above)
        before_batch_tfm:HF_BeforeBatchTransform=None,             
        # The batch_tfms to apply to the creation of your DataLoaders, 
        # (defaults to `HF_Seq2SeqAfterBatchTransform` created using the Hugging Face objects defined above)
        after_batch_tfm:HF_AfterBatchTransform=None,   
        # To control the length of the padding/truncation for the input sequence. It can be an integer or None, 
        # in which case it will default to the maximum length the model can accept. If the model has no 
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length:int=None,
        # To control the length of the padding/truncation for the target sequence. It can be an integer or None, 
        # in which case it will default to the maximum length the model can accept. If the model has no 
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_target_length=None, 
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to 
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding:Union[bool, str]=True, 
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation:Union[bool, str]=True, 
        # The return type your decoded inputs should be cast to (used by methods such as `show_batch`)
        input_return_type=HF_Seq2SeqInput, 
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type=SortedDL, 
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs={}, 
        # Any keyword arguments you want to have applied when generating text
        # (default: default_text_gen_kwargs)
        text_gen_kwargs={},
        # Any keyword arguments you want applied to your before batch tfm
        before_batch_kwargs={}, 
        # Any keyword arguments you want applied to your after batch tfm (or referred to in fastai as `batch_tfms`)
        after_batch_kwargs={}, 
        # Any keyword arguments you want applied to `HF_TextBlock`
        **kwargs
    ):    
        # we need to pass text_gen_kwargs into our HF_Seq2SeqBeforeBatchTransform (use default unless specified)
        if (len(text_gen_kwargs) == 0): 
            # fall back to the Hugging Face objects held by the supplied before batch tfm when not given directly
            if (hf_config is None): hf_config = before_batch_tfm.hf_config
            if (hf_model is None): hf_model = before_batch_tfm.hf_model
            self.text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model)
        else:
            self.text_gen_kwargs = text_gen_kwargs.copy()
            
        # construct our before_batch and after_batch tfms as usual
        if (before_batch_tfm is None): 
            # hand over the *resolved* kwargs (not the raw, possibly empty, argument) so the transform sees the
            # defaults too; a copy keeps the block and the transform from sharing one mutable dict
            before_batch_tfm = HF_Seq2SeqBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model,
                                                              max_length=max_length, 
                                                              max_target_length=max_target_length,
                                                              padding=padding, 
                                                              truncation=truncation,
                                                              tok_kwargs=tok_kwargs.copy(), 
                                                              text_gen_kwargs=self.text_gen_kwargs.copy(), 
                                                              **before_batch_kwargs.copy())

        if (after_batch_tfm is None): 
            hf_tokenizer = hf_tokenizer if (hf_tokenizer is not None) else before_batch_tfm.hf_tokenizer
            after_batch_tfm = HF_Seq2SeqAfterBatchTransform(hf_tokenizer, input_return_type,
                                                            **after_batch_kwargs.copy())
                
        # seq2seq inputs are always treated as raw (not pre-split) text, hence `is_split_into_words=False`
        super().__init__(before_batch_tfm=before_batch_tfm, after_batch_tfm=after_batch_tfm,
                         max_length=max_length, padding=padding, truncation=truncation, 
                         is_split_into_words=False, 
                         input_return_type=input_return_type, dl_type=dl_type, 
                         tok_kwargs=tok_kwargs, 
                         before_batch_kwargs=before_batch_kwargs, 
                         after_batch_kwargs=after_batch_kwargs, 
                         **kwargs)

... and a `DataLoaders.show_batch` for seq2seq tasks

In [None]:
#export
@typedispatch
def show_batch(
    # Dispatch key: this implementation is chosen when the inputs are typed as `HF_Seq2SeqInput`
    x:HF_Seq2SeqInput, 
    # The targets
    y,              
    # The raw (input, target) samples to display
    samples,        
    # The `DataLoaders`; needed to look up the blurr transform that holds the Hugging Face tokenizer
    dataloaders,    
    # The `show_batch` context (returned untouched)
    ctxs=None, 
    # Upper bound on the number of rows displayed
    max_n=6, 
    # Number of characters to keep from each decoded input (`None` keeps everything)
    input_trunc_at=None, 
    # Number of characters to keep from each decoded target (`None` keeps everything)
    target_trunc_at=None, 
    # Any additional `show_batch` keyword arguments
    **kwargs
):  
    "Display up to `max_n` decoded (text, target) pairs from `samples` as a DataFrame and return `ctxs`"
    # the first blurr transform carries both the tokenizer and the id to strip from the targets
    blurr_tfm = first_blurr_tfm(dataloaders)
    tokenizer = blurr_tfm.hf_tokenizer
    ignore_id = blurr_tfm.ignore_token_id

    rows = []
    for sample in samples:
        input_ids, target_ids = sample[0], sample[1]

        # inputs keep their special tokens; targets drop them along with any ignored positions
        decoded_input = tokenizer.decode(input_ids, skip_special_tokens=False)
        decoded_target = tokenizer.decode(target_ids[target_ids != ignore_id], skip_special_tokens=True)

        rows.append((decoded_input[:input_trunc_at], decoded_target[:target_trunc_at]))

    batch_df = pd.DataFrame(L(rows), columns=['text', 'target'])
    display_df(batch_df[:max_n])
    return ctxs

## Summary

This module includes the fundamental bits required for data preparation across all Seq2Seq transformer tasks.

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
