blurr/text/data/seq2seq/core.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../../../nbs/20_text-data-seq2seq-core.ipynb.

# %% auto 0
__all__ = ['Seq2SeqPreprocessor', 'Seq2SeqTextInput', 'Seq2SeqBatchTokenizeTransform', 'Seq2SeqBatchDecodeTransform',
           'default_text_gen_kwargs', 'Seq2SeqTextBlock', 'show_batch']

# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 5
import warnings
from typing import Optional

from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.text.data import SortedDL
from fastai.torch_core import *
from fastai.torch_imports import *
from fastcore.all import *
from transformers import PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel
from transformers.utils import logging as hf_logging

from ..core import BatchDecodeTransform, BatchTokenizeTransform, Preprocessor, TextBlock, TextInput, first_blurr_tfm
from ...utils import get_hf_objects


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 7
# silence all the HF warnings
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 13
class Seq2SeqPreprocessor(Preprocessor):
    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The attribute holding the text
        text_attr: str = "text",
        # The maximum length (# of tokens) allowed for inputs. Will default to the max length allowed
        # by the model if not provided
        max_input_tok_length: Optional[int] = None,
        # The attribute holding the summary
        target_text_attr: str = "summary",
        # The maximum length (# of tokens) allowed for targets
        max_target_tok_length: Optional[int] = None,
        # The attribute that should be created if your are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied with calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # remove "max_length" if set on tok_kwargs as this is set differently for inputs and targets
        tok_kwargs.pop("max_length", None)

        super().__init__(hf_tokenizer, batch_size, text_attr, is_valid_attr, tok_kwargs=tok_kwargs)

        # inputs
        self.max_input_tok_length = max_input_tok_length if max_input_tok_length is not None else hf_tokenizer.model_max_length

        # targets
        self.target_text_attr = target_text_attr
        self.max_target_tok_length = max_target_tok_length

    def _tokenize_function(self, example):
        # tokenize inputs
        inputs = self.hf_tokenizer(example[self.text_attr], max_length=self.max_input_tok_length, **self.tok_kwargs)
        # tokenize targets
        with self.hf_tokenizer.as_target_tokenizer():
            targets = self.hf_tokenizer(example[self.target_text_attr], max_length=self.max_target_tok_length, **self.tok_kwargs)

        return (inputs, targets)


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 16
class Seq2SeqTextInput(TextInput):
    pass


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 20
class Seq2SeqBatchTokenizeTransform(BatchTokenizeTransform):
    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.b., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control whether the "labels" are included in your inputs. If they are, the loss will be calculated in
        # the model's forward function and you can simply use `PreCalculatedLoss` as your `Learner`'s loss function to use it
        include_labels: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id: int = CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation of the input sequence. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the length of the padding/truncation of the target sequence. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_target_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs={},
        # Any keyword arguments to pass to the `hf_model.generate` method
        text_gen_kwargs={},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            include_labels=include_labels,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=False,
            tok_kwargs=tok_kwargs.copy(),
            **kwargs
        )

        store_attr()

    def encodes(self, samples):
        samples = L(samples)

        # tokenize
        src_texts = samples.itemgot(0).items
        tgt_texts = samples.itemgot(1).items if (len(samples[0]) > 1) else None

        # input text
        inputs = self.hf_tokenizer(
            src_texts, max_length=self.max_length, padding=self.padding, truncation=self.truncation, return_tensors="pt", **self.tok_kwargs
        )

        # target text
        targ_ids = [[]] * len(samples)
        if tgt_texts:
            with self.hf_tokenizer.as_target_tokenizer():
                targ_inputs = self.hf_tokenizer(
                    tgt_texts,
                    max_length=self.max_target_length,
                    padding=self.padding,
                    truncation=self.truncation,
                    return_tensors="pt",
                    **self.tok_kwargs
                )

                # padding tokens should be be changed to ignore_token_id so not factored into loss calculation
                targ_inputs["input_ids"].masked_fill_(targ_inputs["input_ids"] == self.hf_tokenizer.pad_token_id, self.ignore_token_id)

                # set targets to target input_ids (req. if calculating loss in fastai training loop and for show methods)
                targ_ids = targ_inputs["input_ids"].clone()

                # if we want hugging face to calculate loss, set the inputs "labels" = the target "input_ids" ... including the labels
                # will also tell the model to properly build the input's "decoder_input_ids" (right-shifted labels where the first token
                # is [PAD] or something similar)
                if self.include_labels:
                    inputs["labels"] = targ_inputs["input_ids"]
                else:
                    decoder_start_tok_id = self.hf_config.get("decoder_start_token_id", self.hf_config.pad_token_id)
                    inputs["decoder_input_ids"] = F.pad(targ_inputs["input_ids"].clone(), pad=(1, 0), value=decoder_start_tok_id)[:, :-1]
                    inputs["decoder_input_ids"].masked_fill_(
                        inputs["decoder_input_ids"] == self.ignore_token_id, self.hf_tokenizer.pad_token_id
                    )

        # update samples with tokenized inputs (e.g. input_ids, attention_mask, etc...)
        d_keys = inputs.keys()
        updated_samples = [
            (*[{k: inputs[k][idx] for k in d_keys}], *tuplify(targ_ids[idx]), *sample[2:]) for idx, sample in enumerate(samples)
        ]

        return updated_samples


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 23
class Seq2SeqBatchDecodeTransform(BatchDecodeTransform):
    def decodes(self, encoded_samples):
        input_ids = encoded_samples["input_ids"] if (isinstance(encoded_samples, dict)) else encoded_samples
        return self.input_return_type(input_ids)


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 25
def default_text_gen_kwargs(hf_config, hf_model, task=None):
    text_gen_kwargs = {}
    hf_config_dict = hf_config.to_dict()

    generate_func_args = list(inspect.signature(hf_model.generate).parameters.keys())
    for k in generate_func_args:
        if k in hf_config_dict:
            text_gen_kwargs.update({k: hf_config_dict[k]})

    # not all configs even have a task_specific_params property
    if task is not None:
        try:
            text_gen_kwargs = {**text_gen_kwargs, **hf_config.task_specific_params[task]}
        except:
            pass

    return text_gen_kwargs


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 28
class Seq2SeqTextBlock(TextBlock):
    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_arch: str = None,
        # A Hugging Face configuration object (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_config: PretrainedConfig = None,
        # A Hugging Face tokenizer (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_tokenizer: PreTrainedTokenizerBase = None,
        # A Hugging Face model (not required if passing in an
        # instance of `BatchTokenizeTransform` to `before_batch_tfm`)
        hf_model: PreTrainedModel = None,
        # The before_batch_tfm you want to use to tokenize your raw data on the fly
        # (defaults to an instance of `BatchTokenizeTransform`)
        batch_tokenize_tfm: Optional[BatchTokenizeTransform] = None,
        # The batch_tfm you want to decode your inputs into a type that can be used in the fastai show methods,
        # (defaults to BatchDecodeTransform)
        batch_decode_tfm: Optional[BatchDecodeTransform] = None,
        # To control the length of the padding/truncation for the input sequence. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the length of the padding/truncation for the target sequence. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-y
        max_target_length=None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = True,
        # The return type your decoded inputs should be cast too (used by methods such as `show_batch`)
        input_return_type=Seq2SeqTextInput,
        # The type of `DataLoader` you want created (defaults to `SortedDL`)
        dl_type=SortedDL,
        # Any keyword arguments you want applied to your `batch_tokenize_tfm`
        batch_tokenize_kwargs: dict = {},
        # Any keyword arguments you want applied to your `batch_decode_tfm` (will be set as a fastai `batch_tfms`)
        batch_decode_kwargs: dict = {},
        # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
        tok_kwargs={},
        # Any keyword arguments you want to have applied with generating text
        # (default: default_text_gen_kwargs)
        text_gen_kwargs={},
        # Any keyword arguments you want applied to `TextBlock`
        **kwargs
    ):
        # we need to pass text_gen_kwargs into our Seq2SeqBatchTokenizeTransform (use default unless specified)
        if len(text_gen_kwargs) == 0:
            if hf_config is None:
                hf_config = batch_tokenize_tfm.hf_config
            if hf_model is None:
                hf_model = batch_tokenize_tfm.hf_model
            self.text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model)
        else:
            self.text_gen_kwargs = text_gen_kwargs.copy()

        # construct our before_batch and after_batch tfms as usual
        if batch_tokenize_tfm is None:
            batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
                hf_arch,
                hf_config,
                hf_tokenizer,
                hf_model,
                max_length=max_length,
                max_target_length=max_target_length,
                padding=padding,
                truncation=truncation,
                tok_kwargs=tok_kwargs.copy(),
                text_gen_kwargs=text_gen_kwargs,
                **batch_tokenize_kwargs.copy()
            )

        if batch_decode_tfm is None:
            hf_tokenizer = hf_tokenizer if (hf_tokenizer is not None) else batch_tokenize_tfm.hf_tokenizer
            batch_decode_tfm = Seq2SeqBatchDecodeTransform(input_return_type, **batch_decode_kwargs.copy())

        return super().__init__(
            batch_tokenize_tfm=batch_tokenize_tfm,
            batch_decode_tfm=batch_decode_tfm,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=False,
            input_return_type=input_return_type,
            dl_type=dl_type,
            tok_kwargs=tok_kwargs,
            before_batch_kwargs=batch_tokenize_kwargs,
            after_batch_kwargs=batch_decode_kwargs,
            **kwargs
        )


# %% ../../../../nbs/20_text-data-seq2seq-core.ipynb 30
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `Seq2SeqTextInput` typed inputs
    x: Seq2SeqTextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    input_trunc_at=None,
    # Any truncation your want applied to your decoded targets
    target_trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs
):
    # grab our tokenizer and ignore token to decode
    tfm = first_blurr_tfm(dataloaders)
    hf_tokenizer = tfm.hf_tokenizer
    ignore_token_id = tfm.ignore_token_id

    res = L(
        [
            (
                hf_tokenizer.decode(s[0], skip_special_tokens=False)[:input_trunc_at],
                hf_tokenizer.decode(s[1][s[1] != ignore_token_id], skip_special_tokens=True)[:target_trunc_at],
            )
            for s in samples
        ]
    )

    display_df(pd.DataFrame(res, columns=["text", "target"])[:max_n])
    return ctxs