In [None]:
# default_exp data.question_answering


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.question_answering

> Question answering tasks require two text inputs: a question and a context that includes the answer. The objective is to predict the start/end tokens of the answer in the context. This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for question answering tasks.

In [None]:
# export
import ast
from functools import reduce

from datasets import Dataset
from fastcore.all import *
from fastai.data.block import DataBlock, CategoryBlock, ColReader, ColSplitter
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import AutoModelForQuestionAnswering, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR
from blurr.data.core import TextInput, BatchDecodeTransform, BatchTokenizeTransform, Preprocessor, first_blurr_tfm

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions
from blurr.data.core import TextBlock

os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option('display.max_colwidth', 100)

print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `squad_v2` to demonstrate how to configure your blurr code for extractive question answering

In [None]:
train_ds = load_dataset("squad_v2", split='train[:1000]')

Reusing dataset squad_v2 (/home/wgilliam/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


In [None]:
squad_df = pd.DataFrame(train_ds)

print(len(squad_df))
squad_df.head(2)


1000


Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}"
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}"


In [None]:
model_cls = AutoModelForQuestionAnswering

pretrained_model_name = "roberta-base"  #'xlm-mlm-ende-1024'
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))


## Utility methods

Starting with version 2.0.0, blurr provides a `find_answer_token_idxs` method that can be used during preprocessing to find the start/end token indices from the start/end character indices commonly included in the raw data.  It returns a tuple of two tensors holding the start and end token indices; both are `0` when the answer cannot be found in the provided `offset_mapping`.

In [None]:
# export
def find_answer_token_idxs(start_char_idx, end_char_idx, offset_mapping, qst_mask):
    """Locate the tokens covering an answer's character span using a tokenizer's `offset_mapping`.

    Args:
        start_char_idx: Character index at which the answer starts.
        end_char_idx: Character index at which the answer ends.
        offset_mapping: Integer tensor with one `(char_start, char_end)` row per token.
        qst_mask: Boolean mask with one entry per token; entries that are `True` mark tokens that are
            excluded from the search (e.g., the question tokens).

    Returns:
        A `(start, end)` tuple of 0-dim tensors holding token positions. `(tensor(0), tensor(0))` is
        returned whenever the start token, the end token, or both cannot be found in these offsets.
    """
    not_found = (tensor(0), tensor(0))

    # mask the question tokens so they aren't included in the search (on a copy, so the caller's tensor
    # is left untouched); -100 can never match a real character index
    masked_offset_mapping = offset_mapping.clone()
    masked_offset_mapping[qst_mask] = tensor([-100, -100])
    tok_starts, tok_ends = masked_offset_mapping[:, 0], masked_offset_mapping[:, 1]

    # start candidates: tokens that begin, or end, exactly at the answer's first character
    is_start = ((tok_starts == start_char_idx) | (tok_ends == start_char_idx)) & (tok_starts <= start_char_idx)
    starts = torch.where(is_start)[0]
    if len(starts) == 0:
        return not_found

    # when several tokens qualify, the last one wins
    start = starts[-1]

    # end candidates: tokens whose character span contains `end_char_idx`, and that don't precede the start
    ends = torch.where((tok_starts <= end_char_idx) & (tok_ends >= end_char_idx))[0]
    ends = ends[ends >= start]
    if len(ends) == 0:
        # previously this case fell through to an unbound `end` variable and raised
        return not_found

    return (start, ends[-1])


In [None]:
show_doc(find_answer_token_idxs)


<h4 id="find_answer_token_idxs" class="doc_header"><code>find_answer_token_idxs</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>find_answer_token_idxs</code>(**`start_char_idx`**, **`end_char_idx`**, **`offset_mapping`**, **`qst_mask`**)



**Parameters:**


 - **`start_char_idx`** : *`<class 'inspect._empty'>`*

 - **`end_char_idx`** : *`<class 'inspect._empty'>`*

 - **`offset_mapping`** : *`<class 'inspect._empty'>`*

 - **`qst_mask`** : *`<class 'inspect._empty'>`*


## Preprocessing

With version 2.0.0 of blurr, we include a `Preprocessor` for question answering that can either truncate texts or else chunk long documents into multiple examples.

**Important**: Unlike other NLP tasks in BLURR, extractive question answering ***requires*** preprocessing in order to convert our raw start/end character indices into start/end token indices to be used as our labels. We are therefore precluded from figuring them out at batch time since they must be specified as the targets we want to predict.

In addition to returning the appropriate start/end token indices for each answer, the preprocessing method here also returns the `input_ids` which ***must*** be used when chunking long documents, and in general, should be used in all cases for consistency's sake.

In [None]:
# export
class QuestionAnsweringPreprocessor(Preprocessor):
    """Preprocess raw question answering data into tokenized examples labeled with answer token indices.

    `process_df` tokenizes every row and emits one output row per tokenized chunk, adding the
    `ans_start_token_idx`/`ans_end_token_idx` targets (both `0` when the answer isn't located in a chunk)
    along with every item returned by the tokenizer (e.g., `input_ids`, `offset_mapping`).
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset. If not specified and "return_overflowing_tokens": True, an "_id" attribute
        # will be added to your dataset with its value a unique, sequential integer, assigned to each record
        id_attr: Optional[str] = None,
        # The attribute in your dataset that contains the context (where the answer is included) (default: 'context')
        ctx_attr: str = "context",
        # The attribute in your dataset that contains the question being asked (default: 'question')
        qst_attr: str = "question",
        # The attribute in your dataset that contains the actual answer (default: 'answer_text')
        ans_attr: str = "answer_text",
        # The attribute in your dataset that contains the character index at which the answer starts in the
        # context (default: 'ans_start_char_idx')
        ans_start_char_idx: str = "ans_start_char_idx",
        # The attribute in your dataset that contains the character index at which the answer ends in the
        # context (default: 'ans_end_char_idx')
        ans_end_char_idx: str = "ans_end_char_idx",
        # The attribute that should be created if your are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied with calling the tokenizer (default: {"return_overflowing_tokens": True})
        tok_kwargs: dict = {"return_overflowing_tokens": True},
    ):
        # work on a copy: the mandatory settings below must not leak into the shared default dictionary
        # or into a dictionary owned by the caller
        tok_kwargs = dict(tok_kwargs)

        # these values are mandatory
        tok_kwargs["return_offsets_mapping"] = True  # allows us to map tokens -> raw characters
        tok_kwargs["padding"] = tok_kwargs.get("padding", True)
        tok_kwargs["return_tensors"] = "pt"

        # shift the question and context appropriately based on the tokenizers padding strategy
        if hf_tokenizer.padding_side == "right":
            tok_kwargs["truncation"] = "only_second"
            text_attrs = [qst_attr, ctx_attr]
        else:
            tok_kwargs["truncation"] = "only_first"
            text_attrs = [ctx_attr, qst_attr]

        # NOTE(review): `is_valid_attr` is stored below but is not forwarded to `Preprocessor.__init__` --
        # confirm the base class default is what is wanted here
        super().__init__(hf_tokenizer, batch_size, text_attrs=text_attrs, tok_kwargs=tok_kwargs)

        self.id_attr = id_attr
        self.qst_attr, self.ctx_attr = qst_attr, ctx_attr
        self.ans_attr, self.ans_start_char_idx, self.ans_end_char_idx = ans_attr, ans_start_char_idx, ans_end_char_idx
        self.is_valid_attr = is_valid_attr

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """Return a new `DataFrame` with one row per tokenized chunk of every example.

        Each output row is a copy of its source row plus `ans_start_token_idx`, `ans_end_token_idx`, and one
        column (a numpy array) per item returned by the tokenizer. Note that the `ans_end_char_idx` column of
        the output holds the source value plus one, as that is the value used to search for the end token.
        """
        df = super().process_df(training_df, validation_df)

        # a unique Id for each example is required to properly score question answering results when chunking long
        # documents (e.g., return_overflowing_tokens=True)
        if self.id_attr is None and self.tok_kwargs.get("return_overflowing_tokens", False):
            df.insert(0, "_id", range(len(df)))

        # the context is the second sequence (id 1) for right-padding tokenizers, otherwise the first (id 0)
        ctx_seq_id = 1 if self.hf_tokenizer.padding_side == "right" else 0

        proc_data = []
        for _, row in df.iterrows():
            # fetch data elements required to build a modelable dataset
            inputs = self._tokenize_function(row)
            start_char_idx, end_char_idx = row[self.ans_start_char_idx], row[self.ans_end_char_idx] + 1

            # if "return_overflowing_tokens = True", our BatchEncoding will include an "overflow_to_sample_mapping" list
            overflow_mapping = inputs["overflow_to_sample_mapping"] if ("overflow_to_sample_mapping" in inputs) else [0]

            for idx in range(len(overflow_mapping)):
                # everything that isn't a context token (the question, special, and padding tokens) is masked
                # out of the search for the answer's start/end token indices
                qst_mask = [seq_id != ctx_seq_id for seq_id in inputs.sequence_ids(idx)]
                start, end = find_answer_token_idxs(start_char_idx, end_char_idx, inputs["offset_mapping"][idx], qst_mask)

                overflow_row = row.copy()
                overflow_row[self.ans_end_char_idx] = end_char_idx
                overflow_row["ans_start_token_idx"] = start.item()
                overflow_row["ans_end_token_idx"] = end.item()

                for k in inputs.keys():
                    overflow_row[k] = inputs[k][idx].numpy()

                proc_data.append(overflow_row)

        return pd.DataFrame(proc_data)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Return the result of the base class's `process_hf_dataset` as is.

        NOTE(review): unlike `process_df`, no answer start/end token indices are added here.
        """
        ds = super().process_hf_dataset(training_ds, validation_ds)

        # return the dataset produced by the base class
        return ds


### Flatten your raw dataset (as needed) 

The `QuestionAnsweringPreprocessor` class requires that start/end character indices exist in their own respective columns

In [None]:
squad_df["ans_start_char_idx"] = squad_df.answers.apply(lambda v: v["answer_start"][0])
squad_df["answer_text"] = squad_df.answers.apply(lambda v: v["text"][0])
squad_df["ans_end_char_idx"] = squad_df["ans_start_char_idx"].astype(int) + squad_df["answer_text"].str.len()

print(len(squad_df))
squad_df.head(2)


1000


Unnamed: 0,id,title,context,question,answers,ans_start_char_idx,answer_text,ans_end_char_idx
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,286
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",207,singing and dancing,226


### How to preprocess your data

In [None]:
tok_kwargs = {"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 64}
preprocessor = QuestionAnsweringPreprocessor(hf_tokenizer, id_attr="id", tok_kwargs=tok_kwargs)
proc_df = preprocessor.process_df(squad_df)

print(len(proc_df))
proc_df.head()

3109


Unnamed: 0,id,title,context,question,answers,ans_start_char_idx,answer_text,ans_end_char_idx,ans_start_token_idx,ans_end_token_idx,input_ids,attention_mask,offset_mapping,overflow_to_sample_mapping
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,84,89,"[0, 520, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 12674, 12695, 272, 354, 6591, 10690, 1634...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 14], [14, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, ...",0
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,32,37,"[0, 520, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 11, 2499, 6, 1184, 6, 79, 3744, 11, 1337,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 14], [14, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, ...",0
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,0,0,"[0, 520, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 333, 1059, 65, 9, 5, 232, 18, 275, 12, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 14], [14, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, ...",0
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",207,singing and dancing,227,77,80,"[0, 653, 911, 222, 12674, 1755, 3511, 11, 77, 79, 21, 1197, 62, 116, 2, 2, 12674, 12695, 272, 35...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 10], [11, 14], [15, 20], [20, 22], [23, 30], [31, 33], [34, 38], [39, 42], ...",0
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",207,singing and dancing,227,30,33,"[0, 653, 911, 222, 12674, 1755, 3511, 11, 77, 79, 21, 1197, 62, 116, 2, 2, 3390, 4, 8912, 8, 117...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 10], [11, 14], [15, 20], [20, 22], [23, 30], [31, 33], [34, 38], [39, 42], ...",0


In [None]:
# spot check a random handful of processed rows: when an answer was located, decoding the labeled
# token span must give back the answer text; otherwise both token indices must be 0
sampled_df = proc_df.sample(n=10)
for _, example in sampled_df.iterrows():
    start_idx, end_idx = example.ans_start_token_idx, example.ans_end_token_idx

    if start_idx == 0 or end_idx == 0:
        test_eq(start_idx, 0)
        test_eq(end_idx, 0)
        continue

    decoded_answer = hf_tokenizer.decode(example["input_ids"][start_idx:end_idx]).strip()
    test_eq(example.answer_text, decoded_answer)


If you want to remove texts longer than your model will hold (and include only answerable contexts)

In [None]:
preprocessor = QuestionAnsweringPreprocessor(hf_tokenizer, tok_kwargs={"return_overflowing_tokens": False, "max_length": max_seq_len})
proc2_df = preprocessor.process_df(squad_df)
proc2_df = proc2_df[(proc2_df.ans_end_token_idx < max_seq_len) & (proc2_df.ans_start_token_idx != 0) & (proc2_df.ans_end_token_idx != 0)]

print(len(proc2_df))
proc2_df.head()


731


Unnamed: 0,id,title,context,question,answers,ans_start_char_idx,answer_text,ans_end_char_idx,ans_start_token_idx,ans_end_token_idx,input_ids,attention_mask,offset_mapping
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,84,89,"[0, 520, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 12674, 12695, 272, 354, 6591, 10690, 1634...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 14], [14, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, ..."
1,56be85543aeaaa14008c9065,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",What areas did Beyonce compete in when she was growing up?,"{'text': ['singing and dancing'], 'answer_start': [207]}",207,singing and dancing,227,77,80,"[0, 653, 911, 222, 12674, 1755, 3511, 11, 77, 79, 21, 1197, 62, 116, 2, 2, 12674, 12695, 272, 35...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 10], [11, 14], [15, 20], [20, 22], [23, 30], [31, 33], [34, 38], [39, 42], ..."
3,56bf6b0f3aeaaa14008c9601,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [166]}",166,"Houston, Texas",181,69,72,"[0, 96, 99, 343, 8, 194, 222, 12674, 1755, 1437, 1733, 62, 116, 1437, 2, 2, 12674, 12695, 272, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 2], [3, 7], [8, 12], [13, 16], [17, 22], [23, 26], [27, 32], [32, 34], [35, 35], [3..."
4,56bf6b0f3aeaaa14008c9602,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}",276,late 1990s,287,87,90,"[0, 96, 61, 2202, 222, 12674, 1755, 555, 3395, 116, 2, 2, 12674, 12695, 272, 354, 6591, 10690, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 2], [3, 8], [9, 15], [16, 19], [20, 25], [25, 27], [28, 34], [35, 41], [41, 42], [0..."
5,56bf6b0f3aeaaa14008c9603,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",In what R&B group was she the lead singer?,"{'text': ['Destiny's Child'], 'answer_start': [320]}",320,Destiny's Child,336,103,106,"[0, 96, 99, 248, 947, 387, 333, 21, 79, 5, 483, 3250, 116, 2, 2, 12674, 12695, 272, 354, 6591, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 2], [3, 7], [8, 9], [9, 10], [10, 11], [12, 17], [18, 21], [22, 25], [26, 29], [30,..."


## Mid-level API

### `QuestionAnswerTextInput`

In [None]:
# export
class QuestionAnswerTextInput(TextInput):
    """A `TextInput` subclass that tags question answering inputs (e.g., so `show_batch` can be typedispatched on it)."""


### `QABatchTokenizeTransform`

In [None]:
# export
class QABatchTokenizeTransform(BatchTokenizeTransform):
    """A `BatchTokenizeTransform` for extractive question answering that adds `cls_index` and `p_mask` to each sample's inputs."""

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # Contrary to other NLP tasks where batch-time tokenization (`is_pretokenized` = False) is the default, with
        # extractive question answering pre-processing is required, and as such we set it to True here
        is_pretokenized: bool = True,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = "only_second",
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs.
        # Since extractive question answering requires pre-tokenized input_ids, we default this to not include any
        # "special" tokens as they are already included in the pre-processed input_ids
        tok_kwargs: dict = {"add_special_tokens": False},
        # Keyword arguments to apply to `BatchTokenizeTransform`
        **kwargs
    ):
        # extractive QA in blurr always needs the special tokens mask and the offsets mapping, so these two
        # settings take precedence over anything supplied by the caller (whose dictionary is left untouched)
        qa_tok_kwargs = dict(tok_kwargs)
        qa_tok_kwargs.update({"return_special_tokens_mask": True, "return_offsets_mapping": True})

        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            is_pretokenized=is_pretokenized,
            ignore_token_id=ignore_token_id,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            tok_kwargs=qa_tok_kwargs,
            **kwargs
        )

    def encodes(self, samples):
        """Run the base class encoding, then add the `cls_index` and `p_mask` entries to each sample's inputs."""
        # the batch encoding returned alongside the samples isn't needed here
        samples, _ = super().encodes(samples, return_batch_encoding=True)

        for sample in samples:
            inputs = sample[0]
            # cls_index: position of the first CLS token in the input_ids (used by xlnet and xlm)
            is_cls = inputs["input_ids"] == self.hf_tokenizer.cls_token_id
            inputs["cls_index"] = is_cls.nonzero()[0]
            # p_mask: mask with 1 for tokens that cannot be in the answer, else 0 (used by xlnet and xlm)
            inputs["p_mask"] = inputs["special_tokens_mask"]

        return samples


## Examples

The following examples demonstrate several approaches to construct your `DataBlock` for question answering tasks using the mid-level API

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects

In [None]:
pretrained_model_name = "distilroberta-base"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=AutoModelForQuestionAnswering)

max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))

#####  Step 2: Preprocess dataset

In [None]:
tok_kwargs = {"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 64}
preprocessor = QuestionAnsweringPreprocessor(hf_tokenizer, id_attr="id", tok_kwargs=tok_kwargs)
proc_df = preprocessor.process_df(squad_df)

proc_df.head(1)

Unnamed: 0,id,title,context,question,answers,ans_start_char_idx,answer_text,ans_end_char_idx,ans_start_token_idx,ans_end_token_idx,input_ids,attention_mask,offset_mapping,overflow_to_sample_mapping
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,84,89,"[0, 520, 222, 12674, 1755, 386, 1959, 1406, 116, 2, 2, 12674, 12695, 272, 354, 6591, 10690, 1634...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 14], [14, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, ...",0


#####  Step 3: Create your `DataBlock`

In [None]:
before_batch_tfm = QABatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len)

blocks = (
    TextBlock(batch_tokenize_tfm=before_batch_tfm, input_return_type=QuestionAnswerTextInput),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab)
)

dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('input_ids'),
    get_y=[ColReader('ans_start_token_idx'), ColReader('ans_end_token_idx')],
    splitter=RandomSplitter(),
    n_inp=1
)


##### Step 4: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)
len(dls.train), len(dls.valid)


(622, 156)

In [None]:
b = dls.one_batch()
len(b), len(b[0]), len(b[1]), len(b[2])


(3, 3, 4, 4)

In [None]:
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape


(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `QuestionAnswerTextInput` typed inputs
    x: QuestionAnswerTextInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs
):
    """Display a table with the decoded text, whether an answer was found, the start/end indices, and the decoded answer."""
    # the tokenizer is read from the blurr batch transform attached to the `DataLoaders`
    hf_tokenizer = first_blurr_tfm(dataloaders, tfms=[QABatchTokenizeTransform]).hf_tokenizer

    res = L()
    for sample, input_ids, start, end in zip(samples, x, *y):
        txt = hf_tokenizer.decode(sample[0], skip_special_tokens=True)[:trunc_at]

        # an answer only counts as found when neither of its token indices is 0
        start_idx, end_idx = start.item(), end.item()
        found = not (start_idx == 0 or end_idx == 0)

        ans_text = hf_tokenizer.decode(input_ids[start_idx:end_idx], skip_special_tokens=False)
        res.append((txt, found, (start_idx, end_idx), ans_text))

    results_df = pd.DataFrame(res, columns=["text", "found", "start/end", "answer"])
    display_df(results_df[:max_n])
    return ctxs


The `show_batch` method above allows us to create a more interpretable view of our question/answer data.

In [None]:
dls.show_batch(dataloaders=dls, max_n=4)


Unnamed: 0,text,found,start/end,answer
0,"If Beyonce won three Grammies in 2015, how many was she nominated for? early September and also the Global Citizen Festival later that month. Beyoncé made an uncredited featured appearance on the track ""Hymn for the Weekend"" by British rock band Coldplay, on their seventh studio album A Head Full of Dreams (2015), which saw release in December. On January 7, 2016, Pepsi announced Beyoncé would perform alongside Coldplay at Super Bowl 50 in February. Knowles has previously performed at four Super Bowl shows throughout her career, serving as the main headliner of the 47th Super Bowl halftime",False,"(0, 0)",
1,"What song did Beyoncé perform at the first inaugural dance for the Obamas. the Neighborhood Ball two days later. Beyoncé and Jay Z held a fundraiser at the latter's 40/40 Club in Manhattan for Obama's 2012 presidential campaign which raised $4 million. Beyoncé uploaded pictures of her paper ballot on Tumblr, confirming she had voted in support for the Democratic Party and to encourage others to do so. She also performed the American national anthem at his second inauguration, singing along with a pre-recorded track. She publicly endorsed same sex marriage on March 26, 2013, after the Supreme Court debate on California's Proposition",False,"(0, 0)",
2,"What establishments did Frédéric frequently visit in Paris that influenced his career? and often technically demanding; his own performances were noted for their nuance and sensitivity. Chopin invented the concept of instrumental ballade. His major piano works also include mazurkas, waltzes, nocturnes, polonaises, études, impromptus, scherzos, preludes and sonatas, some published only after his death. Influences on his compositional style include Polish folk music, the classical tradition of J. S. Bach, Mozart and Schubert, the",False,"(0, 0)",
3,"Who did Beyoncé team up with in 2010 to get her fashions into Brazil?, a ""high-style"" mobile game with a social networking component, featuring the House of Deréon collection. In July 2009, Beyoncé and her mother launched a new junior apparel label, Sasha Fierce for Deréon, for back-to-school selling. The collection included sportswear, outerwear, handbags, footwear, eyewear, lingerie and jewelry. It was available at department stores including Macy's and Dillard's, and specialty stores Jimmy Jazz and Against All Odds. On",False,"(0, 0)",


#### Passing extra information

As mentioned in the data.core module documentation, BLURR now also allows you to pass extra information alongside your inputs in the form of a dictionary.  If we are splitting long documents into chunks but want to predict/aggregate by example (rather than by chunk), we'll want to at least include a unique identifier for each example. When we look at the `modeling.question_answer` module, we'll see how the question answering bits can use such an Id for this purpose.


##### Step 1: Get your Hugging Face objects

In [None]:
pretrained_model_name ='bert-large-uncased-whole-word-masking-finetuned-squad' # "roberta-base"
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=AutoModelForQuestionAnswering)

max_seq_len = 128
vocab = dict(enumerate(range(max_seq_len)))

#####  Step 2: Preprocess dataset

In [None]:
preprocessor = QuestionAnsweringPreprocessor(
    hf_tokenizer,
    id_attr="id",
    ctx_attr="context",
    qst_attr="question",
    ans_attr="answer_text",
    ans_start_char_idx="ans_start_char_idx",
    ans_end_char_idx="ans_end_char_idx",
    tok_kwargs={"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 2},
)
proc_df = preprocessor.process_df(squad_df)
proc_df.head(1)

Unnamed: 0,id,title,context,question,answers,ans_start_char_idx,answer_text,ans_end_char_idx,ans_start_token_idx,ans_end_token_idx,input_ids,token_type_ids,attention_mask,offset_mapping,overflow_to_sample_mapping
0,56be85543aeaaa14008c9063,Beyoncé,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an America...",When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start': [269]}",269,in the late 1990s,287,75,79,"[101, 2043, 2106, 20773, 2707, 3352, 2759, 1029, 102, 20773, 21025, 19358, 22815, 1011, 5708, 10...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","[[0, 0], [0, 4], [5, 8], [9, 16], [17, 22], [23, 31], [32, 39], [39, 40], [0, 0], [0, 7], [8, 10...",0


##### Step 2: Create your `DataBlock`

In [None]:
before_batch_tfm = QABatchTokenizeTransform(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len)

blocks = (
    TextBlock(batch_tokenize_tfm=before_batch_tfm, input_return_type=QuestionAnswerTextInput),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)

def get_x(item):
    """Build the model-input dict for one preprocessed row.

    The rows are already tokenized, so the precomputed token ids are passed through under the
    "input_ids" key; the example's "id" is carried along next to them.
    """
    return dict(input_ids=item.input_ids, id=item.id)


# A single input (the dict from `get_x`) and two targets read from the preprocessed columns:
# the answer's start and end token indices.
dblock = DataBlock(
    blocks=blocks,
    n_inp=1,
    get_x=get_x,
    get_y=[ItemGetter(col) for col in ("ans_start_token_idx", "ans_end_token_idx")],
    splitter=RandomSplitter(),
)


##### Step 3: Build your `DataLoaders`

In [None]:
# Build the train/valid DataLoaders from the preprocessed frame, 4 items per batch.
dls = dblock.dataloaders(proc_df, bs=4)
# Number of batches in the training and validation loaders.
len(dls.train), len(dls.valid)


(426, 107)

In [None]:
# Grab a single batch: `b[0]` is the inputs (n_inp=1); `b[1]` and `b[2]` are the two targets.
b = dls.one_batch()
# Length of the batch tuple followed by the length of each of its three elements.
len(b), len(b[0]), len(b[1]), len(b[2])


(3, 4, 4, 4)

In [None]:
# The inputs behave like a dict (`b[0].keys()` lists everything available). Here we look at
# "special_tokens_mask", an entry that `get_x` did not supply itself.
# Only a cell's last expression is displayed, so the cell contains just that lookup.
b[0]["special_tokens_mask"]

tensor([0, 0, 0, 0], device='cuda:1')

In [None]:
# Shapes of the token ids, the attention mask, and the two target tensors for this batch.
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape


(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

We can see that any additional data returned by `get_x` (here, each example's `id`) is now located in the inputs dictionary.

In [None]:
# The "id" values that `get_x` put into the input dict, one per item in the batch.
b[0]["id"]

['56bfc281a10cfb14005512b8',
 '56ce0f42aab44d1400b88421',
 '56bed32f3aeaaa14008c94cf',
 '56d4d9a92ccc5a1400d832a6']

In [None]:
# Display up to 4 items from a batch; the DataLoaders object is also passed explicitly
# through the `dataloaders` keyword.
dls.show_batch(dataloaders=dls, max_n=4)


Unnamed: 0,text,found,start/end,answer
0,How has L.A. Reid described her? Jim Farber of the Daily News and Stephanie Classen of Star Phoenix both praised her strong voice and her stage presence.,False,"(0, 0)",
1,"On what date did Frédéric give his first performance of Piano Concerto No. 2 in F minor, Op. 21? Back in Warsaw that year, Chopin heard Niccolò Paganini play the violin, and composed a set of variations, Souvenir de Paganini. It may have been this experience which encouraged him to commence writing his first Études, (1829–32), exploring the capacities of his own instrument. On 11 August, three weeks after completing his studies at the Warsaw Conservatory, he made his debut in Vienna. He gave two piano concerts and received many",False,"(0, 0)",
2,"Together how records have they sold? in recent years. Beyoncé suffered a miscarriage in 2010 or 2011, describing it as ""the saddest thing"" she had ever endured. She returned to the studio and wrote music in order to cope with the loss. In April 2011, Beyoncé and Jay Z traveled to Paris in order to shoot the album cover for her 4, and unexpectedly became pregnant in Paris.",False,"(0, 0)",
3,"What is the lead single on Beyoncé's first album?Sung Collaboration for ""Crazy in Love"", and Best R&B Performance by a Duo or Group with Vocals for ""The Closer I Get to You"" with Luther Vandross.",False,"(0, 0)",


## Summary

This module includes all the low-, mid-, and high-level API bits needed to prepare data for extractive Q&A tasks.

In [None]:
# hide
# Run nbdev's notebook-to-script export; the output below lists each notebook it converted.
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
