In [None]:
# default_exp data.question_answering


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.question_answering

> Question answering tasks require two text inputs: a question and a context that contains the answer. The objective is to predict the start/end tokens of the answer within the context. This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for question answering tasks.

In [None]:
# export
import ast
from functools import reduce

from fastcore.all import *
from fastai.data.block import DataBlock, CategoryBlock, ColReader, ColSplitter
from fastai.imports import *
from fastai.losses import CrossEntropyLossFlat
from fastai.torch_core import *
from fastai.torch_imports import *
from transformers import AutoModelForQuestionAnswering, logging, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel

from blurr.utils import BLURR
from blurr.data.core import HF_BaseInput, HF_AfterBatchTransform, HF_BeforeBatchTransform, first_blurr_tfm

logging.set_verbosity_error()


In [None]:
# hide_input
import pdb

from datasets import load_dataset
from fastai.data.core import DataLoader, DataLoaders, TfmdDL
from fastai.data.external import untar_data, URLs
from fastai.data.transforms import *
from fastcore.test import *
from nbverbose.showdoc import show_doc

from blurr.utils import print_versions
from blurr.data.core import HF_TextBlock

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.15.0


In [None]:
# hide
# cuda
torch.cuda.set_device(1)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `squad_v2` to demonstrate how to configure your blurr code for extractive question answering

In [None]:
train_ds = load_dataset("squad_v2", split='train[:1000]')

Reusing dataset squad_v2 (/home/wgilliam/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/ba48bc29b974701e9ba8d80ac94f3e3df924aba41b764dcf9851debea7c672e4)


In [None]:
squad_df = pd.DataFrame(train_ds)

print(len(squad_df))
squad_df.head(2)


1000


Unnamed: 0,answers,context,id,question,title
0,"{'answer_start': [269], 'text': ['in the late 1990s']}","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé
1,"{'answer_start': [207], 'text': ['singing and dancing']}","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was growing up?,Beyoncé


In [None]:
# Hugging Face auto class handed to `BLURR.get_hf_objects` as `model_cls`
model_cls = AutoModelForQuestionAnswering

pretrained_model_name = "roberta-base"  #'xlm-mlm-ende-1024'
# `get_hf_objects` returns four objects, unpacked here as the architecture name, config, tokenizer, and model
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

# Sequence length used for tokenization (`max_length`) and filtering throughout this notebook
max_seq_len = 128
# Identity mapping {0: 0, 1: 1, ..., max_seq_len - 1: max_seq_len - 1}; passed as the `vocab` of the
# start/end `CategoryBlock`s in the examples below
vocab = dict(enumerate(range(max_seq_len)))


## Utility methods

Starting with version 2.0.0, blurr provides a `find_answer_token_idxs` method that can be used to preprocess your raw dataset beforehand or on the fly. It returns three tensors: the start and end token indices of the answer, and a flag indicating whether the answer can be found in the provided set of `input_ids`.

In [None]:
# export
def find_answer_token_idxs(ans_data, input_ids, offset_mapping, qst_mask, hf_tokenizer=None):
    """Locate the start/end token indices of an answer inside one tokenized sequence.

    Args:
        ans_data: Indexable as `(answer_text, start_char_idx, end_char_idx)`; the character indices
            are compared against the values in `offset_mapping`.
        input_ids: The token ids of the sequence; sliced as `input_ids[start:end]` to decode candidates.
        offset_mapping: Tensor whose `[:, 0]` / `[:, 1]` columns hold each token's start/end character offset.
        qst_mask: Boolean mask (list or tensor) that is `True` for every token to exclude from the search
            (e.g., question and special tokens).
        hf_tokenizer: Optional tokenizer used to decode candidate spans and verify them against
            `answer_text`. When omitted, a module-level `hf_tokenizer` is used if one exists (the
            historical behavior of this function); if none is available the text verification is
            skipped and the last matching start/end candidates are returned.

    Returns:
        A `(start, end, found)` tuple of 0-d tensors. `found` is 1 when both a start and an end
        candidate exist, otherwise all three values are 0.
    """
    ans_text, start_char_idx, end_char_idx = ans_data[0], ans_data[1], ans_data[2]

    # backwards compatibility: earlier versions read the tokenizer from a module-level global
    if hf_tokenizer is None:
        hf_tokenizer = globals().get("hf_tokenizer")

    # mask the question tokens so they aren't included in the search (the caller's tensor is left untouched)
    mask = torch.as_tensor(qst_mask, dtype=torch.bool, device=offset_mapping.device)
    masked_offset_mapping = offset_mapping.clone()
    masked_offset_mapping[mask] = -100

    # based on the character start/end index, see if we can find the span of tokens in the `offset_mapping`
    tok_starts, tok_ends = masked_offset_mapping[:, 0], masked_offset_mapping[:, 1]
    starts = torch.where((tok_starts == start_char_idx) | (tok_ends == start_char_idx))[0]
    ends = torch.where((tok_starts <= end_char_idx) & (tok_ends >= end_char_idx))[0]

    if len(starts) > 0 and len(ends) > 0:
        # default to the last candidates; prefer the first (start, end) pair that decodes to the answer text
        start, end = starts[-1], ends[-1]
        if hf_tokenizer is not None:
            target = ans_text.strip()
            for s, e in itertools.product(starts, ends):
                if hf_tokenizer.decode(input_ids[s:e]).strip() == target:
                    start, end = s, e
                    break

        # defensive bound check (indices produced by `torch.where` are already within the sequence)
        if end < len(masked_offset_mapping):
            return (start, end, torch.tensor(1))

    # if no start or no end candidate was found in this sequence, consider the answer not found
    return (torch.tensor(0), torch.tensor(0), torch.tensor(0))


In [None]:
show_doc(find_answer_token_idxs)


<h4 id="find_answer_token_idxs" class="doc_header"><code>find_answer_token_idxs</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>find_answer_token_idxs</code>(**`ans_data`**, **`input_ids`**, **`offset_mapping`**, **`qst_mask`**)



**Parameters:**


 - **`ans_data`** : *`<class 'inspect._empty'>`*

 - **`input_ids`** : *`<class 'inspect._empty'>`*

 - **`offset_mapping`** : *`<class 'inspect._empty'>`*

 - **`qst_mask`** : *`<class 'inspect._empty'>`*


## Preprocessing methods

With version 2.0.0 of blurr, we include a generalized preprocessing function you can use for finding the start/end token indices based on your tokenizer of choice.

In [None]:
# export
def pre_process_qa(
    # Your pd.DataFrame
    raw_df,
    # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc.)
    hf_arch: str,
    # A Hugging Face tokenizer
    hf_tokenizer: PreTrainedTokenizerBase,
    # The attribute in your dataset that contains the context (where the answer is included) (default: 'context')
    ctx_attr: str = "context",
    # The attribute in your dataset that contains the question being asked (default: 'question')
    qst_attr: str = "question",
    # The attribute in your dataset that contains the actual answer (default: 'answer_text')
    ans_attr: str = "answer_text",
    # The attribute in your dataset that contains the character index where the answer starts (default: 'ans_start_char_idx')
    ans_start_char_idx: str = "ans_start_char_idx",
    # The attribute in your dataset that contains the character index where the answer ends (default: 'ans_end_char_idx')
    ans_end_char_idx: str = "ans_end_char_idx",
    # Other column data from the raw DataFrame you want to include in the processed DataFrame
    keep_cols: list = [],
    # Any keyword arguments you want your Hugging Face tokenizer to use during tokenization
    tok_kwargs: dict = {"return_overflowing_tokens": True},
):
    """Build a DataFrame of question/context rows annotated with answer token indices.

    Every row of `raw_df` is tokenized with `hf_tokenizer`; one output row is appended per entry of the
    tokenizer's `overflow_to_sample_mapping` (or a single row if that key is absent). `hf_arch` is not
    used by this function and is kept for interface compatibility. Neither `raw_df`, `keep_cols`, nor
    `tok_kwargs` is modified.

    Returns:
        A `pd.DataFrame` with the columns `question`, `context`, `answer_text`, `has_answer`,
        `ans_start_token_idx`, `ans_end_token_idx`, `ans_start_char_idx`, `ans_end_char_idx`,
        followed by `keep_cols`. Note that the stored `ans_end_char_idx` is the raw value plus one.
    """
    # work on copies so neither the caller's objects nor the shared default arguments are mutated
    keep_cols = list(keep_cols)
    tok_kwargs = {
        **tok_kwargs,
        # these values are mandatory
        "return_offsets_mapping": True,
        "padding": tok_kwargs.get("padding", True),
        "return_tensors": "pt",
    }

    # order the question and context based on the tokenizer's padding strategy; the context is always the
    # sequence that gets truncated, and `ctx_seq_id` is the sequence id its tokens receive
    ctx_first = hf_tokenizer.padding_side != "right"
    tok_kwargs["truncation"] = "only_first" if ctx_first else "only_second"
    ctx_seq_id = 0 if ctx_first else 1

    proc_data = []
    for _, row in raw_df.iterrows():
        # fetch data elements required to build a modelable dataset
        context, qst = row[ctx_attr], row[qst_attr]
        # NOTE(review): the end index is shifted one character further; presumably so that the token matched
        # for the end can be used as an exclusive slice bound (`input_ids[start:end]`) - confirm
        ans_text, start_char_idx, end_char_idx = row[ans_attr], row[ans_start_char_idx], row[ans_end_char_idx] + 1
        ans_data = (ans_text, start_char_idx, end_char_idx)

        if ctx_first:
            tok_d = hf_tokenizer(context, qst.lstrip(), **tok_kwargs)
        else:
            tok_d = hf_tokenizer(qst.lstrip(), context, **tok_kwargs)

        overflow_mapping = tok_d["overflow_to_sample_mapping"] if ("overflow_to_sample_mapping" in tok_d) else [0]

        # NOTE(review): this iterates the *values* of `overflow_to_sample_mapping`, which appear to be sample
        # indices (all 0 for a single question/context pair), so chunk 0 is likely inspected once per
        # overflow chunk. Left unchanged because each row stores the full, un-chunked context; confirm
        # whether per-chunk targets were intended.
        for idx in overflow_mapping:
            # mask everything that is not a context token (question, special, and padding tokens)
            qst_mask = [i != ctx_seq_id for i in tok_d.sequence_ids(idx)]
            start, end, has_ans = find_answer_token_idxs(ans_data, tok_d["input_ids"][idx], tok_d["offset_mapping"][idx], qst_mask)
            start, end, has_ans = start.item(), end.item(), has_ans.item()

            row_data = [qst, context, ans_text, has_ans, start, end, start_char_idx, end_char_idx]
            row_data += [row[col] for col in keep_cols]

            proc_data.append(row_data)

    base_cols = [
        "question",
        "context",
        "answer_text",
        "has_answer",
        "ans_start_token_idx",
        "ans_end_token_idx",
        "ans_start_char_idx",
        "ans_end_char_idx",
    ]
    return pd.DataFrame(proc_data, columns=base_cols + keep_cols)


In [None]:
show_doc(pre_process_qa)


<h4 id="pre_process_qa" class="doc_header"><code>pre_process_qa</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>pre_process_qa</code>(**`raw_df`**, **`hf_arch`**:`str`, **`hf_tokenizer`**:`PreTrainedTokenizerBase`, **`ctx_attr`**:`str`=*`'context'`*, **`qst_attr`**:`str`=*`'question'`*, **`ans_attr`**:`str`=*`'answer_text'`*, **`ans_start_char_idx`**:`str`=*`'ans_start_char_idx'`*, **`ans_end_char_idx`**:`str`=*`'ans_end_char_idx'`*, **`keep_cols`**:`list`=*`[]`*, **`tok_kwargs`**:`dict`=*`{'return_overflowing_tokens': True}`*)



**Parameters:**


 - **`raw_df`** : *`<class 'inspect._empty'>`*	<p>Your pd.DataFrame</p>


 - **`hf_arch`** : *`<class 'str'>`*	<p>The abbreviation/name of your Hugging Face transformer architecture (e.b., bert, bart, etc..)</p>


 - **`hf_tokenizer`** : *`<class 'transformers.tokenization_utils_base.PreTrainedTokenizerBase'>`*	<p>A Hugging Face tokenizer</p>


 - **`ctx_attr`** : *`<class 'str'>`*, *optional*	<p>The attribute in your dataset that contains the context (where the answer is included) (default: 'context')</p>


 - **`qst_attr`** : *`<class 'str'>`*, *optional*	<p>The attribute in your dataset that contains the question being asked (default: 'question')</p>


 - **`ans_attr`** : *`<class 'str'>`*, *optional*	<p>The attribute in your dataset that contains the actual answer (default: 'answer_text')</p>


 - **`ans_start_char_idx`** : *`<class 'str'>`*, *optional*	<p>The attribute in your dataset that contains the actual answer (default: 'answer_text')</p>


 - **`ans_end_char_idx`** : *`<class 'str'>`*, *optional*	<p>The attribute in your dataset that contains the actual answer (default: 'answer_text')</p>


 - **`keep_cols`** : *`<class 'list'>`*, *optional*	<p>Other column data from the raw DataFrame you want to include in the processed DataFrame</p>


 - **`tok_kwargs`** : *`<class 'dict'>`*, *optional*	<p>Any keyword arguments you want your Hugging Face tokenizer to use during tokenization</p>



How to preprocess your data

In [None]:
# Flatten the first entry of the nested SQuAD `answers` dict into the flat columns `pre_process_qa` reads by default
first_answer_starts = squad_df["answers"].map(lambda ans: ans["answer_start"][0])
first_answer_texts = squad_df["answers"].map(lambda ans: ans["text"][0])

squad_df["ans_start_char_idx"] = first_answer_starts
squad_df["answer_text"] = first_answer_texts
# end index = start index + answer length
squad_df["ans_end_char_idx"] = first_answer_starts.astype(int) + first_answer_texts.str.len()

print(len(squad_df))
squad_df.head(2)


1000


Unnamed: 0,answers,context,id,question,title,ans_start_char_idx,answer_text,ans_end_char_idx
0,"{'answer_start': [269], 'text': ['in the late 1990s']}","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",56be85543aeaaa14008c9063,When did Beyonce start becoming popular?,Beyoncé,269,in the late 1990s,286
1,"{'answer_start': [207], 'text': ['singing and dancing']}","Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",56be85543aeaaa14008c9065,What areas did Beyonce compete in when she was growing up?,Beyoncé,207,singing and dancing,226


In [None]:
proc_df = pre_process_qa(
    squad_df,
    hf_arch,
    hf_tokenizer,
    keep_cols=["id", "title"],
    tok_kwargs={"return_overflowing_tokens": True, "max_length": max_seq_len, "stride": 2},
)

print(len(proc_df))
proc_df.head(2)


2189


Unnamed: 0,question,context,answer_text,has_answer,ans_start_token_idx,ans_end_token_idx,ans_start_char_idx,ans_end_char_idx,id,title
0,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",in the late 1990s,1,84,89,269,287,56be85543aeaaa14008c9063,Beyoncé
1,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",in the late 1990s,1,84,89,269,287,56be85543aeaaa14008c9063,Beyoncé


In [None]:
# Spot-check ten random processed rows against a fresh tokenization of the same question/context pair
sampled_df = proc_df.sample(n=10)
for _, example in sampled_df.iterrows():
    encoded = hf_tokenizer(example.question, example.context)
    start_idx, end_idx = example.ans_start_token_idx, example.ans_end_token_idx

    if example.has_answer != 1:
        # rows without an answer must carry the (0, 0) span
        test_eq(start_idx, 0)
        test_eq(end_idx, 0)
        continue

    # rows with an answer: decoding the recorded token span must reproduce the answer text
    decoded_answer = hf_tokenizer.decode(encoded["input_ids"][start_idx:end_idx]).strip()
    test_eq(example.answer_text, decoded_answer)


If you want to remove texts longer than your model will hold (and include only answerable contexts)

In [None]:
# keep only the rows flagged as having an answer whose end token index is below `max_seq_len`
answer_fits = proc_df["ans_end_token_idx"] < max_seq_len
answer_found = proc_df["has_answer"] == 1
proc2_df = proc_df[answer_fits & answer_found]

print(len(proc2_df))
proc2_df.head(2)


1497


Unnamed: 0,question,context,answer_text,has_answer,ans_start_token_idx,ans_end_token_idx,ans_start_char_idx,ans_end_char_idx,id,title
0,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",in the late 1990s,1,84,89,269,287,56be85543aeaaa14008c9063,Beyoncé
1,When did Beyonce start becoming popular?,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five G...",in the late 1990s,1,84,89,269,287,56be85543aeaaa14008c9063,Beyoncé


## Mid-level API

### `HF_QuestionAnswerInput`

In [None]:
# export
class HF_QuestionAnswerInput(HF_BaseInput):
    """Input type for question answering; adds nothing of its own to `HF_BaseInput`.

    It is passed as the `input_return_type` of `HF_TextBlock` and is the type the
    question answering `show_batch` is dispatched on.
    """


### `HF_QABeforeBatchTransform`

In [None]:
# export
class HF_QABeforeBatchTransform(HF_BeforeBatchTransform):
    """`HF_BeforeBatchTransform` for extractive question answering.

    After the parent class has tokenized the samples, `encodes` replaces each sample's targets with
    `(has_answer, start_token_idx, end_token_idx)` and adds the `cls_index` and `p_mask` inputs.
    """

    def __init__(
        self,
        # The abbreviation/name of your Hugging Face transformer architecture (e.b., bert, bart, etc..)
        hf_arch: str,
        # A specific configuration instance you want to use
        hf_config: PretrainedConfig,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # A Hugging Face model
        hf_model: PreTrainedModel,
        # To control the length of the padding/truncation. It can be an integer or None,
        # in which case it will default to the maximum length the model can accept. If the model has no
        # specific maximum input length, truncation/padding to max_length is deactivated.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        max_length: int = None,
        # To control the `padding` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `'do_not_pad'.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        padding: Union[bool, str] = True,
        # To control `truncation` applied to your `hf_tokenizer` during tokenization. If None, will default to
        # `False` or `do_not_truncate`.
        # See [Everything you always wanted to know about padding and truncation](https://huggingface.co/transformers/preprocessing.html#everything-you-always-wanted-to-know-about-padding-and-truncation)
        truncation: Union[bool, str] = "only_second",
        # The `is_split_into_words` argument applied to your `hf_tokenizer` during tokenization. Set this to `True`
        # if your inputs are pre-tokenized (not numericalized)
        is_split_into_words: bool = False,
        # The token ID that should be ignored when calculating the loss
        ignore_token_id=CrossEntropyLossFlat().ignore_index,
        # Any other keyword arguments you want included when using your `hf_tokenizer` to tokenize your inputs
        tok_kwargs: dict = {},
        # Keyword arguments to apply to `HF_BeforeBatchTransform`
        **kwargs
    ):
        # "return_special_tokens_mask" and "return_offsets_mapping" are mandatory for extractive QA in blurr,
        # so they are layered on top of (and win over) whatever the caller supplied; the caller's dict is not modified
        qa_tok_kwargs = dict(tok_kwargs)
        qa_tok_kwargs.update(return_special_tokens_mask=True, return_offsets_mapping=True)

        super().__init__(
            hf_arch,
            hf_config,
            hf_tokenizer,
            hf_model,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            is_split_into_words=is_split_into_words,
            ignore_token_id=ignore_token_id,
            tok_kwargs=qa_tok_kwargs,
            **kwargs
        )

    def encodes(self, samples):
        """Tokenize `samples` via the parent class, then compute the answer targets for each one.

        Returns a list of `(inputs, has_answer, start_token_idx, end_token_idx)` tuples where the three
        targets are `TensorCategory` instances.
        """
        samples, batch_encoding = super().encodes(samples, return_batch_encoding=True)

        qa_samples = []
        for sample_idx, sample in enumerate(samples):
            # sample[0] holds the tokenized inputs, sample[1] the answer data handed to `find_answer_token_idxs`
            inputs, ans_data = sample[0], sample[1]

            # everything that is not part of the second sequence (sequence id 1) is excluded from the answer search
            not_context = [seq_id != 1 for seq_id in batch_encoding.sequence_ids(sample_idx)]
            start, end, has_ans = find_answer_token_idxs(ans_data, inputs["input_ids"], inputs["offset_mapping"], not_context)

            # cls_index: location of CLS token (used by xlnet and xlm); is a list.index(value) for pytorch tensor's
            inputs["cls_index"] = (inputs["input_ids"] == self.hf_tokenizer.cls_token_id).nonzero()[0]
            # p_mask: mask with 1 for token than cannot be in the answer, else 0 (used by xlnet and xlm)
            inputs["p_mask"] = inputs["special_tokens_mask"]

            # the targets become: is_found, answer start token index, and answer end token index
            qa_samples.append((inputs, TensorCategory(has_ans), TensorCategory(start), TensorCategory(end)))

        return qa_samples


In [None]:
# export
def get_dummy_token_idx(r):
    """Placeholder target getter: ignores the row `r` and always returns 0.

    Used as a `get_y` getter for the start/end token targets in the examples below;
    `HF_QABeforeBatchTransform.encodes` replaces the targets with values it computes itself.
    """
    return 0


### Example 1: Using a pre-processed dataset

In [None]:
# Batch-time transform that tokenizes the inputs and computes the answer targets
before_batch_tfm = HF_QABeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=max_seq_len)

# One block per item produced by `get_x`/`get_y` in the `DataBlock` below
blocks = (
    # the (question, context) input
    HF_TextBlock(before_batch_tfm=before_batch_tfm, input_return_type=HF_QuestionAnswerInput),
    # no block for the first target, the (answer_text, start, end) tuple; `HF_QABeforeBatchTransform.encodes`
    # reads it as `s[1]`
    None,
    # the last two targets, each a category over `vocab`
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)


def get_ans_and_start_end_char_idxs(r):
    """Return `(answer_text, start_char_idx, end_char_idx)` for the first answer of row `r`.

    `end_char_idx` is computed as `start_char_idx + len(answer_text) + 1`.
    """
    answers = r.answers
    start, answer_text = answers["answer_start"][0], answers["text"][0]
    return (answer_text, start, start + len(answer_text) + 1)


dblock = DataBlock(
    blocks=blocks,
    # the input is the (question, context) pair
    get_x=lambda x: (x.question, x.context),
    # three targets: the raw answer data plus two dummy values (always 0) for the token index targets
    get_y=[get_ans_and_start_end_char_idxs, get_dummy_token_idx, get_dummy_token_idx],
    # NOTE(review): no seed is passed to the splitter, so the train/valid split presumably differs between runs
    splitter=RandomSplitter(),
    # only the first of the four items is an input; the remaining three are targets
    n_inp=1,
)


In [None]:
dls = dblock.dataloaders(squad_df, bs=4)
len(dls.train), len(dls.valid)


(200, 50)

In [None]:
b = dls.one_batch()
len(b), len(b[0]), len(b[1]), len(b[2])


(4, 6, 4, 4)

In [None]:
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape


(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

In [None]:
# export
@typedispatch
def show_batch(
    # This typedispatched `show_batch` will be called for `HF_QuestionAnswerInput` typed inputs
    x: HF_QuestionAnswerInput,
    # Your targets
    y,
    # Your raw inputs/targets
    samples,
    # Your `DataLoaders`. This is required so as to get at the Hugging Face objects for
    # decoding them into something understandable
    dataloaders,
    # Your `show_batch` context
    ctxs=None,
    # The maximum number of items to show
    max_n=6,
    # Any truncation your want applied to your decoded inputs
    trunc_at=None,
    # Any other keyword arguments you want applied to `show_batch`
    **kwargs
):
    """Display a table with the decoded text, found flag, start/end token indices, and decoded answer span."""
    # the tokenizer comes from the QA before-batch transform attached to the dataloaders
    tokenizer = first_blurr_tfm(dataloaders, HF_QABeforeBatchTransform).hf_tokenizer

    rows = L()
    for raw_sample, ids, found_t, start_t, end_t in zip(samples, x, *y):
        decoded_input = tokenizer.decode(raw_sample[0], skip_special_tokens=True)
        # special tokens are kept when decoding the answer span
        answer = tokenizer.decode(ids[start_t:end_t], skip_special_tokens=False)
        rows.append((decoded_input[:trunc_at], found_t.item() == 1, (start_t.item(), end_t.item()), answer))

    # only the first `max_n` rows are displayed
    batch_df = pd.DataFrame(rows, columns=["text", "found", "start/end", "answer"])
    display_df(batch_df[:max_n])
    return ctxs


The `show_batch` method above allows us to create a more interpretable view of our question/answer data.

In [None]:
dls.show_batch(dataloaders=dls, max_n=4, trunc_at=500)


Unnamed: 0,text,found,start/end,answer
0,"Where did Beyoncé exclusively release her single, Formation? On February 6, 2016, one day before her performance at the Super Bowl, Beyoncé released a new single exclusively on music streaming service Tidal called ""Formation"".",True,"(41, 43)",Tidal
1,"In what year did Frédéric officially acquire French citizenship? Chopin arrived in Paris in late September 1831; he would never return to Poland, thus becoming one of many expatriates of the Polish Great Emigration. In France he used the French versions of his given names, and after receiving French citizenship in 1835, he travelled on a French passport. However, Chopin remained close to his fellow Poles in exile as friends and confidants and he never felt fully comfortable speaking French. Cho",True,"(68, 70)",1835
2,"What did Beyonce's Fashion Diva feature? In 2005, Beyoncé teamed up with House of Brands, a shoe company, to produce a range of footwear for House of Deréon. In January 2008, Starwave Mobile launched Beyoncé Fashion Diva, a ""high-style"" mobile game with a social networking component, featuring the House of Deréon collection. In July 2009, Beyoncé and her mother launched a new junior apparel label, Sasha Fierce for Deréon, for back-to-school selling. The collection included sportswear, outerwea",True,"(73, 79)",House of Deréon collection
3,"How many Grammys has Beyoncé won? Beyoncé has won 20 Grammy Awards, both as a solo artist and member of Destiny's Child, making her the second most honored female artist by the Grammys, behind Alison Krauss and the most nominated woman in Grammy Award history with 52 nominations. ""Single Ladies (Put a Ring on It)"" won Song of the Year in 2010 while ""Say My Name"" and ""Crazy in Love"" had previously won Best R&B Song. Dangerously in Love, B'Day and I Am... Sasha Fierce have all won Best Contempora",True,"(16, 17)",20


In [None]:
# hide
for idx, b in enumerate(dls.valid):
    pass
    # for input_ids, start_idx, end_idx in zip(b[0]["input_ids"], b[2], b[3]):
    #     print(hf_tokenizer.decode(input_ids))
    #     if start_idx.item() != 0:
    #         print(f"*** ANSWER: {hf_tokenizer.decode(input_ids[start_idx:end_idx])} ***\n")
    #     else:
    #         print("*** NO ANSWER ***")

    # if (idx == 1):
    #     break

print(idx)


49


### Example 2: Batch-time targets

In [None]:
# Same transform as in Example 1, but padding every sequence to `max_length` and asking the tokenizer
# to return the overflowing tokens (with a stride of 2) instead of only truncating
before_batch_tfm = HF_QABeforeBatchTransform(
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    max_length=max_seq_len,
    padding="max_length",
    tok_kwargs={"return_overflowing_tokens": True, "stride": 2},
)

# One block per item produced by `get_x`/`get_y` in the `DataBlock` below
blocks = (
    # the (question, context) input
    HF_TextBlock(before_batch_tfm=before_batch_tfm, input_return_type=HF_QuestionAnswerInput),
    # no block for the first target, the (answer_text, start, end) tuple; `HF_QABeforeBatchTransform.encodes`
    # reads it as `s[1]`
    None,
    # the last two targets, each a category over `vocab`
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab),
)


def get_ans_and_start_end_char_idxs(r):
    """Return `(answer_text, start_char_idx, end_char_idx)` for the first answer of row `r`.

    `end_char_idx` is computed as `start_char_idx + len(answer_text) + 1`.
    """
    answers = r.answers
    start, answer_text = answers["answer_start"][0], answers["text"][0]
    return (answer_text, start, start + len(answer_text) + 1)


dblock = DataBlock(
    blocks=blocks,
    # the input is the (question, context) pair
    get_x=lambda x: (x.question, x.context),
    # three targets: the raw answer data plus two dummy values (always 0) for the token index targets
    get_y=[get_ans_and_start_end_char_idxs, get_dummy_token_idx, get_dummy_token_idx],
    # NOTE(review): no seed is passed to the splitter, so the train/valid split presumably differs between runs
    splitter=RandomSplitter(),
    # only the first of the four items is an input; the remaining three are targets
    n_inp=1,
)


In [None]:
dls = dblock.dataloaders(squad_df, bs=4)
len(dls.train), len(dls.valid)


(200, 50)

In [None]:
b = dls.one_batch()
len(b), len(b[0]), len(b[1]), len(b[2])


(4, 7, 4, 4)

In [None]:
b[0]["input_ids"].shape, b[0]["attention_mask"].shape, b[1].shape, b[2].shape


(torch.Size([4, 128]), torch.Size([4, 128]), torch.Size([4]), torch.Size([4]))

In [None]:
dls.show_batch(dataloaders=dls, max_n=4, trunc_at=500)


Unnamed: 0,text,found,start/end,answer
0,"Which prominent star felt the 2009 Female Video of the Year award should have went to Beyoncé instead of Taylor Swift?, Beyoncé embarked on the I Am... World Tour, her second headlining worldwide concert tour, consisting of 108 shows, grossing $119.5 million.",False,"(0, 0)",
1,"In 2009, Beyonce started her second world tour and grossed how much money? On April 4, 2008, Beyoncé married Jay Z. She publicly revealed their marriage in a video montage at the listening party for her third studio album, I Am... Sasha Fierce, in Manhattan's Sony Club on October 22, 2008. I Am... Sasha Fierce was released on November 18, 2008 in the United States. The album formally introduces Beyoncé's alter ego Sasha Fierce, conceived during the making of her 2003 single ""Crazy in Love"", sel",False,"(0, 0)",
2,"In 2009, Beyonce started her second world tour and grossed how much money?, debuting atop the Billboard 200, and giving Beyoncé her third consecutive number-one album in the US. The album featured the number-one song ""Single Ladies (Put a Ring on It)"" and the top-five songs ""If I Were a Boy"" and ""Halo"". Achieving the accomplishment of becoming her longest-running Hot 100 single in her career, ""Halo""'s success in the US helped Beyoncé attain more top-ten singles on the list than any other woman",False,"(0, 0)",
3,"In 2009, Beyonce started her second world tour and grossed how much money?s. It also included the successful ""Sweet Dreams"", and singles ""Diva"", ""Ego"", ""Broken-Hearted Girl"" and ""Video Phone"". The music video for ""Single Ladies"" has been parodied and imitated around the world, spawning the ""first major dance craze"" of the Internet age according to the Toronto Star. The video has won several awards, including Best Video at the 2009 MTV Europe Music Awards, the 2009 Scottish MOBO Awards, and the",False,"(0, 0)",


In [None]:
# hide
for idx, b in enumerate(dls.valid):
    pass
    # for input_ids, start_idx, end_idx in zip(b[0]["input_ids"], b[2], b[3]):
    #     print(hf_tokenizer.decode(input_ids))
    #     if start_idx.item() != 0:
    #         print(f"*** ANSWER: {hf_tokenizer.decode(input_ids[start_idx:end_idx])} ***\n")
    #     else:
    #         print("*** NO ANSWER ***")

    # if (idx == 1):
    #     break

print(idx)


103


## Summary

This module includes all the low, mid, and high-level API bits for extractive Q&A tasks data preparation.

In [None]:
# hide
from nbdev.export import notebook2script

notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
