In [None]:
# default_exp data.seq2seq.summarization


In [None]:
# all_slow


In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.seq2seq.summarization

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for summarization tasks using architectures like BART and T5. Summarization tasks attempt to generate a human-understandable and sensible representation of a larger body of text (e.g., capture the meaning of a larger document in 1-3 sentences).

In [None]:
# export
from typing import Optional

import numpy as np
import pandas as pd

from datasets import Dataset
from fastai.data.block import DataBlock
from transformers import AutoModelForSeq2SeqLM, PreTrainedTokenizerBase, logging

from blurr.utils import BLURR
from blurr.data.seq2seq.core import Seq2SeqBatchTokenizeTransform, Seq2SeqPreprocessor, Seq2SeqTextBlock

logging.set_verbosity_error()


In [None]:
# hide_input
import os, ast, pdb
from functools import reduce

from datasets import load_dataset
from fastai.data.transforms import *
from fastai.torch_core import *
from fastai.torch_imports import *
from fastcore.all import *
from fastcore.test import *
from nbdev.showdoc import show_doc
from transformers import BartForConditionalGeneration

from blurr.utils import print_versions

os.environ["TOKENIZERS_PARALLELISM"] = "false"
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")


What we're running with at the time this documentation was generated:
torch: 1.10.1+cu111
fastai: 2.5.3
transformers: 4.16.2


In [None]:
# hide
# cuda
# The docs were generated on a multi-GPU machine using GPU #1; fall back to GPU #0 when only a
# single device is visible so this cell doesn't raise on single-GPU setups.
torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")


Using GPU #1: GeForce GTX 1080 Ti


## Setup

We'll use a subset of `cnn_dailymail` to demonstrate how to configure BLURR for summarization tasks.

In [None]:
raw_datasets = load_dataset("cnn_dailymail", "3.0.0", split=["train", "validation"])
raw_datasets

Reusing dataset cnn_dailymail (/home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234)


  0%|          | 0/2 [00:00<?, ?it/s]

[Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights', 'id'],
     num_rows: 13368
 })]

In [None]:
print(raw_datasets[0][0].keys())
print(raw_datasets[0][0]["highlights"])

print(raw_datasets[1][0].keys())
print(raw_datasets[1][0]["highlights"])


dict_keys(['article', 'highlights', 'id'])
Syrian official: Obama climbed to the top of the tree, "doesn't know how to get down"
Obama sends a letter to the heads of the House and Senate .
Obama to seek congressional approval on military action against Syria .
Aim is to determine whether CW were used, not by whom, says U.N. spokesman .
dict_keys(['article', 'highlights', 'id'])
Accident happens in Santa Ynez, California, near where Crosby lives .
The jogger suffered multiple fractures; his injuries are not believed to be life-threatening .


In [None]:
raw_train_ds = raw_datasets[0].shuffle(seed=42).select(range(1000))
raw_valid_ds = raw_datasets[1].shuffle(seed=42).select(range(200))

len(raw_train_ds) + len(raw_valid_ds)

Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-516bef66c83f0d37.arrow
Loading cached shuffled indices for dataset at /home/wgilliam/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/3cb851bf7cf5826e45d49db2863f627cba583cbc32342df7349dfe6c38060234/cache-e7e93c0052828394.arrow


1200

In [None]:
raw_train_df = pd.DataFrame(raw_train_ds)
raw_valid_df = pd.DataFrame(raw_valid_ds)

raw_train_df.head(2)

Unnamed: 0,article,highlights,id
0,"A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death .\nMs Ferrell tweeted as she was arrested, piled in a small wagon with 7 others .\nThey were accused of 'noise disruption', put in orange jumpsuits and cuffed .\nOfficers now being investigated, lawyers claim they 'overstretched powers'",1e01f238418c31d4e9093f6334e0232babeb639a
1,"A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...",Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis is the first time the social network has offered image editing tools .,6f89645bff243fe9ce2a0509e5ca01912abf0d10


In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)


('bart',
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

## Preprocessing

Starting with version 2.0, BLURR provides a preprocessing base class that can be used to build task specific pre-processed datasets from pandas DataFrames or Hugging Face Datasets

### `SummarizationPreprocessor`

Starting with version 2.0, BLURR provides a summarization preprocessing class that can be used to preprocess DataFrames or Hugging Face Datasets.

This class can be used for preprocessing summarization tasks. The processed output includes `proc_{text_attr}` and `proc_{target_text_attr}` columns containing your input and target texts as modified by tokenization (e.g., if you specify a `max_input_tok_length`, `proc_{text_attr}` may contain truncated text).

In [None]:
# export
class SummarizationPreprocessor(Seq2SeqPreprocessor):
    """Preprocess a DataFrame or Hugging Face Dataset for summarization tasks.

    On top of what `Seq2SeqPreprocessor` provides, each processed example gets, for both the input
    text and the target text, `{attr}_start_char_idx` / `{attr}_end_char_idx` columns and a
    `proc_{attr}` column holding the slice of the raw text covered by its tokenized representation.
    """

    def __init__(
        self,
        # A Hugging Face tokenizer
        hf_tokenizer: PreTrainedTokenizerBase,
        # The number of examples to process at a time
        batch_size: int = 1000,
        # The unique identifier in the dataset
        id_attr: Optional[str] = None,
        # The attribute holding the text
        text_attr: str = "text",
        # The maximum length (# of tokens) allowed for inputs. Will default to the max length allowed
        # by the model if not provided
        max_input_tok_length: Optional[int] = None,
        # The attribute holding the summary
        target_text_attr: str = "summary",
        # The maximum length (# of tokens) allowed for targets
        max_target_tok_length: Optional[int] = None,
        # If not "None", any examples where "target_text_attr" is < "min_summary_char_length" will be removed
        min_summary_char_length: Optional[int] = None,
        # The attribute that should be created if you are processing individual training and validation
        # datasets into a single dataset, and will indicate to which each example is associated
        is_valid_attr: Optional[str] = "is_valid",
        # Tokenization kwargs that will be applied when calling the tokenizer
        tok_kwargs: dict = {},
    ):
        # we need to use the offset mappings to get back at the raw text from its tokenized representation
        # (a new dict is built here, so the shared `{}` default above is never mutated)
        tok_kwargs = {**tok_kwargs, "return_offsets_mapping": True}

        super().__init__(
            hf_tokenizer, batch_size, text_attr, max_input_tok_length, target_text_attr, max_target_tok_length, is_valid_attr, tok_kwargs
        )

        self.id_attr = id_attr
        self.min_summary_char_length = min_summary_char_length

    def process_df(self, training_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None):
        """Process `training_df` (and optionally `validation_df`) in mini-batches of `self.batch_size` rows.

        Returns a single DataFrame with a fresh 0..n-1 index, or an empty DataFrame if there was nothing to process.
        """
        df = super().process_df(training_df, validation_df)

        # process df in mini-batches; collect the pieces and concatenate once (`DataFrame.append` in a loop
        # is quadratic and no longer exists in pandas >= 2.0)
        batch_dfs = [self._process_df_batch(batch_df) for _, batch_df in df.groupby(np.arange(len(df)) // self.batch_size)]
        if not batch_dfs:
            return pd.DataFrame()

        return pd.concat(batch_dfs, ignore_index=True)

    def process_hf_dataset(self, training_ds: Dataset, validation_ds: Optional[Dataset] = None):
        """Process Hugging Face `Dataset`(s) by round-tripping through `process_df`; returns a new `Dataset`."""
        ds = super().process_hf_dataset(training_ds, validation_ds)
        return Dataset.from_pandas(self.process_df(pd.DataFrame(ds)))

    # ----- utility methods -----
    def _process_df_batch(self, batch_df):
        """Tokenize one mini-batch and add the character-index and `proc_*` columns to it."""
        # remove summaries that are too short if a min character length is specified
        if self.min_summary_char_length:
            batch_df = batch_df[batch_df[self.target_text_attr].str.len() >= self.min_summary_char_length]

        # re-assign rather than reset in place: `batch_df` may be a filtered slice, and the positional
        # index is needed so the `pd.concat(..., axis=1)` below lines up row-for-row
        batch_df = batch_df.reset_index(drop=True)

        # grab our inputs and targets batch encoding objects
        inputs, targets = self._tokenize_function(batch_df.to_dict(orient="list"))

        # add our processed text and target texts to the batched DataFrame
        for txt_attr, batch_enc in zip([self.text_attr, self.target_text_attr], [inputs, targets]):
            if txt_attr is None:
                break

            start_col, end_col = f"{txt_attr}_start_char_idx", f"{txt_attr}_end_char_idx"

            # every token's (start, end) offset is considered: the smallest offset gives the start and the
            # offset with the largest start gives the end
            # NOTE(review): special/padding tokens presumably report (0, 0), which pins the start at 0 — confirm
            char_idxs = [[min(offsets)[0], max(offsets)[1]] for offsets in batch_enc["offset_mapping"]]

            batch_df = pd.concat([batch_df, pd.DataFrame(char_idxs, columns=[start_col, end_col])], axis=1)

            # tokenizer offsets are end-exclusive, so slice with `end` as is (using `end + 1` would pull in
            # one character beyond the last token whenever the text was truncated)
            batch_df.insert(
                0,
                f"proc_{txt_attr}",
                batch_df.apply(lambda r: r[txt_attr][r[start_col] : r[end_col]], axis=1),
            )

        return batch_df


#### Using a `DataFrame`

In [None]:
preprocessor = SummarizationPreprocessor(
    hf_tokenizer,
    id_attr="id",
    text_attr="article",
    target_text_attr="highlights",
    max_input_tok_length=128,
    max_target_tok_length=30,
    min_summary_char_length=10,
)
proc_df = preprocessor.process_df(raw_train_df, raw_valid_df)
proc_df.columns, len(proc_df)
proc_df.head(2)


Unnamed: 0,proc_highlights,proc_article,article,highlights,id,is_valid,article_start_char_idx,article_end_char_idx,highlights_start_char_idx,highlights_end_char_idx
0,"Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death","A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","A protester in Ferguson was arrested during a demonstration on Thursday night - and live-tweeted her entire experience. Brittany Ferrell, a nursing student at the University of Missouri-Saint Louis, was one of 13 people detained by officers in the conflicted Missouri city for 'noise disruption'. The detention has sparked an investigation by the American Civil Liberties Union as lawyers accuse officers of overstretching their powers. Scroll down for video . Arrested: This is Brittany Ferrell, the nursing student and protester who live-tweeted her arrest in Ferguson . Tweeting in handcuffs, ...","Brittany Ferrell, nursing student, was arrested with 12 people on Thursday .\nThey were calling on police take responsibility for Michael Brown's death .\nMs Ferrell tweeted as she was arrested, piled in a small wagon with 7 others .\nThey were accused of 'noise disruption', put in orange jumpsuits and cuffed .\nOfficers now being investigated, lawyers claim they 'overstretched powers'",1e01f238418c31d4e9093f6334e0232babeb639a,False,0,648,0,150
1,Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis,"A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...","A day after confirming it had lost the ability to display Instagram images, Twitter has rolled out its own library of retro filters for its Android and iPhone apps. The eight filters are the usual suspects we've come to expect from mobile photo apps, including desaturated, black and white and high contrast. There are auto-adjust and cropping options, as well as a helpful grid view that lets you see what each filter will look like at once. ""The latest versions of Twitter for iPhone and Twitter for Android introduce a few new ways to enhance the images you tweet,"" said Twitter senior designe...",Twitter has added photo filters to its Android and iOS mobile apps .\nThe addition will help Twitter compete against Facebook-owned Instagram .\nThis is the first time the social network has offered image editing tools .,6f89645bff243fe9ce2a0509e5ca01912abf0d10,False,0,635,0,147


## Examples

### Using the mid-level API

#### Batch-Time Tokenization

##### Step 1: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

#####  Step 2: Create your `DataBlock`

Two lines!  Notice we pass in `noop` for our targets (e.g. our summaries) because the batch transform will take care of both our inputs and targets.

In [None]:
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader("article"), get_y=ColReader("highlights"), splitter=RandomSplitter())


In [None]:
# dblock.summary(raw_train_df)


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(raw_train_df, bs=4)


In [None]:
b = dls.one_batch()


In [None]:
len(b), b[0]["input_ids"].shape, b[0]["labels"].shape, b[1].shape


(2, torch.Size([4, 1024]), torch.Size([4, 72]), torch.Size([4, 72]))

In [None]:
b[0]["labels"][0], b[1][0]

(tensor([    0,   270,  3905,  2950,   516,     9,   908,    25,    37,  5586,
           940,  2355,   375,   479, 50118,  9167,   703,    15,     5,   276,
           183,  1284,  2922, 11137,  4457,    30,   299,   940,  2355,  3504,
            11,   188,   469,   479,     2,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100], device='cuda:1'),
 tensor([    0,   270,  3905,  2950,   516,     9,   908,    25,    37,  5586,
           940,  2355,   375,   479, 50118,  9167,   703,    15,     5,   276,
           183,  1284,  2922, 11137,  4457,    30,   299,   940,  2355,  3504,
            11,   188,   469,   479,     2,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000, target_trunc_at=250)


Unnamed: 0,text,target
0,"<s> By. Daily Mail Reporter. PUBLISHED:. 08:16 EST, 14 May 2012. |. UPDATED:. 22:07 EST, 14 May 2012. Barack Obama's latest campaign gambit follows a familiar line of attack as it uses Mitt Romney's private equity past to cast the Republican candidate as greedy, job-killing corporate titan with little concern for the working class. The President is not the first of Mr Romney's opponents to try and paint the former governor of Massachusetts as a heartless uber-capitalist - even his Republican rivals used the same tactic during the heated primary battle. But Mr Obama's campaign seems to have been particularly unoriginal - as his attack ad is almost identical to one produced by Ted Kennedy for his Senate campaign against Mr Romney in 1994, featuring unemployed workers complaining about Bain Capital, the firm founded by Mr Romney. The timing of the Obama assault on private equity is also unfortunate, as on Monday night the President attended a fundraiser hosted by Democratic supporter Ham",President follows familiar line of attack as he highlights private equity past.\nAd released on the same day Obama attended fundraiser hosted by top private equity boss in New York.
1,"<s> (CNN) -- President Barack Obama and GOP vice presidential nominee Paul Ryan on Friday traded sharp criticism over health care reforms and Medicare, with each telling a leading advocacy group for senior citizens that the other was being untruthful. ""Contrary to what you've heard and what you may hear from subsequent speakers, Obamacare actually strengthened Medicare,"" the president told the AARP Liffe@50+ event, using the nickname for the 2010 Affordable Care Act that passed with no Republican support. In particular, he called the claim by Ryan and other Republicans that $716 billion is being cut from Medicare to fund the health care bill ""simply not true."" Ryan spoke to the same event shortly afterward, saying that Obama's contention that the health care law strengthened Medicare was ""just not true,"" adding that the legislation ""turned Medicare into a piggy bank for Obamacare."" The debate over Medicare is a major issue in the November election campaign, especially in the vital batt",NEW: Sen. Reid says Romney still isn't coming clean on taxes.\nPaul Ryan gets mixed reception at AARP event.\nRyan and President Obama accuse each other of being untruthful.\nObama says leadership means rejecting bad ideas.


#### Using a preprocessed dataset

##### Step 1a: Get your Hugging Face objects.

In [None]:
pretrained_model_name = "facebook/bart-large-cnn"
model_cls = AutoModelForSeq2SeqLM

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(pretrained_model_name, model_cls=model_cls)

##### Step 1b. Preprocess dataset

In [None]:
preprocessor = SummarizationPreprocessor(
    hf_tokenizer,
    id_attr="id",
    text_attr="article",
    target_text_attr="highlights",
    max_input_tok_length=128,
    max_target_tok_length=30,
    min_summary_char_length=10,
)
proc_df = preprocessor.process_df(raw_train_df, raw_valid_df)


##### Step 2: Create your `DataBlock`

In [None]:
blocks = (Seq2SeqTextBlock(hf_arch, hf_config, hf_tokenizer, hf_model), noop)
dblock = DataBlock(blocks=blocks, get_x=ColReader("proc_article"), get_y=ColReader("proc_highlights"), splitter=ColSplitter())


##### Step 3: Build your `DataLoaders`

In [None]:
dls = dblock.dataloaders(proc_df, bs=4)

In [None]:
# use `input_trunc_at` (as in the other `show_batch` calls in this notebook) so the truncation is actually applied
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=500)

Unnamed: 0,text,target
0,"<s> Washington (CNN) -- A post-mortem Sunday of the mid-term elections provided little evidence that Democrats and Republicans will work together to address major issues such as deficit reduction any better than they have in recent years. Republicans interviewed on talk shows promised congressional investigations, an all-out effort to repeal health care reform, and steadfast opposition to any form of higher taxes. Democrats, meanwhile, said the losses they suffered in the congressional elections reflected voter dissatisfaction with lingering high unemployment in the slow recovery from economic recession, rather than an outright repudiation of their policies. Republicans won more than 60 seats formerly held by Democrats to take majority control of </s>","GOP targets health care reform, government spending.\n""Are we willing to work with him?"" Cantor says of President Obama.\nObama says"
1,"<s> Washington (CNN) -- President Barack Obama will deploy up to 1,200 more National Guard troops to the U.S. border with Mexico, an administration official told CNN on Tuesday. In addition, Obama will request $500 million to supplement current spending for enhanced border protection and law enforcement activities, the official said. The National Guard troops will help with drug enforcement efforts and intelligence efforts until Customs and Border Protection can recruit and train additional officers and agents to serve on the border, the official said. The news followed Obama's lunch meeting with Senate Republicans, where Sen. John McCain of Arizona raised the issue of increased border security. McCain </s>","NEW: Mexico says additional U.S. troops at border should battle organized crime, not illegal immigration.\nObama to send 1,200"


## Tests

The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained **summarization models** below.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would be required to download.

**Note**: Feel free to modify the code below to test whatever pretrained summarization models you are working with ... and if any of your pretrained summarization models fail, please submit a github issue *(or a PR if you'd like to fix it yourself)*

In [None]:
[model_type for model_type in BLURR.get_models(task="ConditionalGeneration") if (not model_type.startswith("TF"))]


['BartForConditionalGeneration',
 'BigBirdPegasusForConditionalGeneration',
 'BlenderbotForConditionalGeneration',
 'BlenderbotSmallForConditionalGeneration',
 'FSMTForConditionalGeneration',
 'LEDForConditionalGeneration',
 'M2M100ForConditionalGeneration',
 'MBartForConditionalGeneration',
 'MT5ForConditionalGeneration',
 'PegasusForConditionalGeneration',
 'ProphetNetForConditionalGeneration',
 'Speech2TextForConditionalGeneration',
 'T5ForConditionalGeneration',
 'XLMProphetNetForConditionalGeneration']

In [None]:
pretrained_model_names = [
    "facebook/bart-base",
    "facebook/blenderbot_small-90M",
    "allenai/led-base-16384",
    "google/mt5-small",
    "google/pegasus-cnn_dailymail",
    "t5-small",
    "microsoft/prophetnet-large-uncased",
    "microsoft/xprophetnet-large-wiki100-cased",  # XLMProphetNet
]


In [None]:
path = Path("./")
cnndm_df = pd.read_csv(path / "cnndm_sample.csv")


In [None]:
# slow
# hide_output
model_cls = AutoModelForSeq2SeqLM
bsz = 2
seq_sz = 256
trg_seq_sz = 40

# one (architecture, tokenizer class name, model name, "PASSED"/"FAILED", error) tuple per model
test_results = []
for model_name in pretrained_model_names:
    print(f"=== {model_name} ===\n")

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR.get_hf_objects(model_name, model_cls=model_cls)
    print(f"architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n")

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if hf_tokenizer.pad_token is None:
        hf_tokenizer.add_special_tokens({"pad_token": "<pad>"})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()["<pad>"]
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
        hf_arch, hf_config, hf_tokenizer, hf_model, padding="max_length", max_length=seq_sz, max_target_length=trg_seq_sz
    )

    def add_t5_prefix(inp):
        # only the "t5" architecture gets the task prefix prepended to its inputs
        return f"summarize: {inp}" if (hf_arch == "t5") else inp

    # everything that can fail for a given architecture lives inside the `try` so that one bad model is
    # recorded as FAILED instead of aborting the remaining models
    try:
        blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)
        dblock = DataBlock(
            blocks=blocks, get_x=Pipeline([ColReader("article"), add_t5_prefix]), get_y=ColReader("highlights"), splitter=RandomSplitter()
        )

        dls = dblock.dataloaders(cnndm_df, bs=bsz)
        b = dls.one_batch()

        print("*** TESTING DataLoaders ***\n")
        test_eq(len(b), 2)
        test_eq(len(b[0]["input_ids"]), bsz)
        test_eq(b[0]["input_ids"].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        if hasattr(hf_tokenizer, "add_prefix_space") and hf_arch not in ["led"]:
            test_eq(hf_tokenizer.add_prefix_space, True)

        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)

        # record success only once every check (including `show_batch`) has run, so a model can never
        # end up with both a PASSED and a FAILED entry
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "PASSED", ""))

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, "FAILED", err))


=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s> Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat","Hotel guests who ""go green"" are happier with their stay.\nIncreasing water and energy costs are pushing hotels to cut costs wherever they can.\nMany hotels find that guests don't"
1,"<s> Washington (CNN)Almost immediately following the news of the first terrorist attacks that eventually killed 17 people across France, the global community united around a Twitter hashtag ""Je suis Charlie"" and just days later foreign leaders linked arms with their French counterparts to lead a historic million-person strong rally. Meanwhile, explosives strapped to a girl who appeared to be about 10-years-old detonated on Saturday, killing at least 20 people, in a country whose encounters with terrorism were also punctuated by a hashtag -- this time ""#BringBackOurGirls"" of Nigeria. Boko Haram militants killed as many as 2,000 people, mostly civilians,in a massacre that started the weekend before the terror attack on Charlie Hedbo in downtown Paris. Both the attacks in Nigeria and those in Paris are shocking and horrifying in their own respects, and yet one fomented an unprecedented international reaction -- a popular show of force that rivaled even the reaction to 9/11 -- while the re","France and Nigeria experienced waves of terrorism during the first weeks of 2015.\nWhile the terror attacks in Paris sparked international unified outrage, reaction to Nigeria was more muted.\nSymbolism,"


=== facebook/blenderbot_small-90M ===

architecture:	blenderbot_small
tokenizer:	BlenderbotSmallTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"__unk__ cnn ) __unk__ - home to up to 10 percent of all known species, mexico is recognized as one of the most biodiverse regions on the planet. the twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. and there is a great deal to lose. in the united nations environment program __unk__ unep ) world conservation monitoring centre's list of megadiverse countries mexico ranks 11th. the list represents a group of 17 countries that harbor the majority of the earth's species and are therefore considered extremely biodiverse. from its coral reefs in the caribbean sea to its tropical jungles in chiapas and the yucatan peninsula and its de__unk__ and prairies in the north, mexico boasts an incredibly rich variety of flora and fauna. some 574 out of 717 reptile species found in mexico __unk__ - the most in any country __unk__ - can only be encountered within its borders. it is home to 502 types of ma","mexico hosts to up to 10 percent of all known species on earth. __newln__ it is home to 502 types of mammals, 290 bird species and 26 000 types of plants. __newln__ human development"
1,"it's no secret that a battle has boiled over in the republican party. the fight has played out in the policy arena but also on the campaign trail. and since the inception of the tea party in 2009, it seemed like that wing had the upper hand. it slowly made effective inroads into a party many members of the vocal new group thought had lost its way. they elected a new breed of republican into office, including texas sen. ted cruz and kentucky sen. rand paul, who surprised the political world by defeating establishment__unk__ backed candidates in their respective primaries. but those two successes haven't been the norm, especially in the senate, as many inexperienced but ideologically more pure candidates have been unable to seal the deal. in 2010, sharron angle won the senate primary in nevada and christine o'donnell won in delaware. two years later, richard mourdock and todd akin won in indiana and missouri respectively. all four went on to lose against the democrat. in a year in which","republicans are taking an aggressive stance against intraparty opposition. __newln__ one conservative groups called mississippi republican incumbent a liberal"" __newln__ establishment trying to prevent candidates who can't win in general election"


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"<s>(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 v","Mexico hosts to up to 10 percent of all known species on Earth.\nIt is home to 502 types of mammals, 290 bird species and 26,000 types of plants.\nHuman development"
1,"<s>Some U.S. officials this year are expected to get smartphones capable of handling classified government documents over cellular networks, according to people involved in the project. The phones will run a modified version of Google's Android software, which is being developed as part of an initiative that spans multiple federal agencies and government contractors, these people said. The smartphones are first being deployed to U.S. soldiers, people familiar with the project said. Later, federal agencies are expected to get phones for sending and receiving government cables while away from their offices, sources said. Eventually, local governments and corporations could give workers phones with similar software. The Army has been testing touchscreen devices at U.S. bases for nearly two years, said Michael McCarthy, a director for the Army's Brigade Modernization Command, in a phone interview. About 40 phones were sent to fighters overseas a year ago, and the Army plans to ship 50 more","Government, military officials to get Android phones capable of sharing secret documents.\nThe phones will run a modified version of Google's Android software, sources say.\nContractor: Google ""more"


=== google/mt5-small ===

architecture:	mt5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds,</s>","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of"
1,"As a growing number of airplanes scoured the southern Indian Ocean in the search for Malaysia Airlines Flight 370, authorities released new details that paint a different picture of what may have happened in the plane's cockpit. Military radar tracking shows that the aircraft changed altitude after making a sharp turn over the South China Sea as it headed toward the Strait of Malacca, a source close to the investigation into the missing flight told CNN. The plane flew as low as 12,000 feet at some point before it disappeared from radar, according to the source. The sharp turn seemed to be intentional, the source said, because executing it would have taken the Boeing 777 two minutes -- a time period during which the pilot or co-pilot could have sent an emergency signal if there had been a fire or other emergency onboard. Authorities say the plane didn't send any emergency signals, though some analysts say it's still unclear whether the pilots tried but weren't able to communicate becaus","U.S. Navy sending listening device to help find voice and data recorders if wreckage is found. Source: Plane changed altitude, flying as low"


=== google/pegasus-cnn_dailymail ===

architecture:	pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 vari","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants. Human development and climate change"
1,"Washington (CNN)Almost immediately following the news of the first terrorist attacks that eventually killed 17 people across France, the global community united around a Twitter hashtag ""Je suis Charlie"" and just days later foreign leaders linked arms with their French counterparts to lead a historic million-person strong rally. Meanwhile, explosives strapped to a girl who appeared to be about 10-years-old detonated on Saturday, killing at least 20 people, in a country whose encounters with terrorism were also punctuated by a hashtag -- this time ""#BringBackOurGirls"" of Nigeria. Boko Haram militants killed as many as 2,000 people, mostly civilians,in a massacre that started the weekend before the terror attack on Charlie Hedbo in downtown Paris. Both the attacks in Nigeria and those in Paris are shocking and horrifying in their own respects, and yet one fomented an unprecedented international reaction -- a popular show of force that rivaled even the reaction to 9/11 -- while the respon","France and Nigeria experienced waves of terrorism during the first weeks of 2015. While the terror attacks in Paris sparked international unified outrage, reaction to Nigeria was more muted. Symbolism, politics and"


=== t5-small ===

architecture:	t5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"summarize: Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to","Hotel guests who ""go green"" are happier with their stay. Increasing water and energy costs are pushing hotels to cut costs wherever they can. Many hotels find that guests don't"
1,"summarize: Washington (CNN) -- Moments after the U.S. Supreme Court ruled on a pair of same-sex marriage cases, the handful of Democrats considering running for president in 2016 stumbled over themselves in a rush of celebratory reaction, blasting out a salvo of congratulatory press releases and tweets. Rulings hailed as historic victory. Voting 5-4 in each of two decisions, the justices struck down part of the Defense of Marriage Act that denied federal benefits to same-sex couples and cleared the way for gays and lesbians to once again marry in California. Former Secretary of State Hillary Clinton, whose husband signed DOMA into law in 1996, applauded the decision. ""By overturning the Defense of Marriage Act, the court recognized that discrimination towards any group holds us all back in our efforts to form a more perfect union,"" Clinton said in a joint statement with her husband, Bill Clinton. Maryland Gov. Martin O'Malley called the rulings ""a powerful step forward."" New York Gov.","Democrats who support same-sex marriage are now playing on increasingly friendly political turf. GOP leaders express dismay at ruling, but seem eager to be rid of a polarizing issue"


=== microsoft/prophetnet-large-uncased ===





architecture:	prophetnet
tokenizer:	ProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"( cnn ) - - home to up to 10 percent of all known species, mexico is recognized as one of the most biodiverse regions on the planet. the twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. and there is a great deal to lose. in the united nations environment program ( unep ) world conservation monitoring centre's list of megadiverse countries mexico ranks 11th. the list represents a group of 17 countries that harbor the majority of the earth's species and are therefore considered extremely biodiverse. from its coral reefs in the caribbean sea to its tropical jungles in chiapas and the yucatan peninsula and its deserts and prairies in the north, mexico boasts an incredibly rich variety of flora and fauna. some 574 out of 717 reptile species found in mexico - - the most in any country - - can only be encountered within its borders. it is home to 502 types of mammals, 290 species of birds, 1,","mexico hosts to up to 10 percent of all known species on earth. it is home to 502 types of mammals, 290 bird species and 26, 000 types of plants. human development and climate"
1,"washington ( cnn ) almost immediately following the news of the first terrorist attacks that eventually killed 17 people across france, the global community united around a twitter hashtag "" je suis charlie "" and just days later foreign leaders linked arms with their french counterparts to lead a historic million - person strong rally. meanwhile, explosives strapped to a girl who appeared to be about 10 - years - old detonated on saturday, killing at least 20 people, in a country whose encounters with terrorism were also punctuated by a hashtag - - this time "" # bringbackourgirls "" of nigeria. boko haram militants killed as many as 2, 000 people, mostly civilians, in a massacre that started the weekend before the terror attack on charlie hedbo in downtown paris. both the attacks in nigeria and those in paris are shocking and horrifying in their own respects, and yet one fomented an unprecedented international reaction - - a popular show of force that rivaled even the reaction to 9 / 11","france and nigeria experienced waves of terrorism during the first weeks of 2015. while the terror attacks in paris sparked international unified outrage, reaction to nigeria was more muted. symbolism, politics and media all played"


=== microsoft/xprophetnet-large-wiki100-cased ===

architecture:	xlm_prophetnet
tokenizer:	XLMProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 vari","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants"
1,"Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat them","Hotel guests who ""go green"" are happier with their stay. Increasing water and energy costs are pushing hotels to cut costs wherever they can. Many hotels find that"


In [None]:
# slow
# hide_input
# Tabulate the outcomes gathered in `test_results` (populated by an earlier cell)
# so each tested pretrained model shows up as one row with its result and error.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=["arch", "tokenizer", "model_name", "result", "error"],
)
display_df(test_results_df)


Unnamed: 0,arch,tokenizer,model_name,result,error
0,bart,BartTokenizerFast,facebook/bart-base,PASSED,
1,blenderbot_small,BlenderbotSmallTokenizer,facebook/blenderbot_small-90M,PASSED,
2,led,LEDTokenizerFast,allenai/led-base-16384,PASSED,
3,mt5,T5TokenizerFast,google/mt5-small,PASSED,
4,pegasus,PegasusTokenizerFast,google/pegasus-cnn_dailymail,PASSED,
5,t5,T5TokenizerFast,t5-small,PASSED,
6,prophetnet,ProphetNetTokenizer,microsoft/prophetnet-large-uncased,PASSED,
7,xlm_prophetnet,XLMProphetNetTokenizer,microsoft/xprophetnet-large-wiki100-cased,PASSED,


## Summary

This module includes the fundamental data preprocessing bits to use Blurr for summarization.

In [None]:
# hide
# Standard nbdev closing cell: running it converts the project's notebooks
# (the output below lists each converted .ipynb file).
from nbdev.export import notebook2script

# Called with no arguments, so nbdev's defaults decide which notebooks are processed.
notebook2script()


Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01_modeling-core.ipynb.
Converted 02_data-language-modeling.ipynb.
Converted 02_modeling-language-modeling.ipynb.
Converted 03_data-token-classification.ipynb.
Converted 03_modeling-token-classification.ipynb.
Converted 04_data-question-answering.ipynb.
Converted 04_modeling-question-answering.ipynb.
Converted 10_data-seq2seq-core.ipynb.
Converted 10_modeling-seq2seq-core.ipynb.
Converted 11_data-seq2seq-summarization.ipynb.
Converted 11_modeling-seq2seq-summarization.ipynb.
Converted 12_data-seq2seq-translation.ipynb.
Converted 12_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-high-level-api.ipynb.
Converted 99b_examples-glue.ipynb.
Converted 99c_examples-glue-plain-pytorch.ipynb.
Converted 99d_examples-multilabel.ipynb.
Converted 99e_examples-causal-lm-gpt2.ipynb.
Converted index.ipynb.
