In [None]:
# default_exp data.summarization

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data.summarization

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for summarization tasks using architectures like BART, T5, or good ol' GPT2.

In [None]:
#export
import ast
from functools import reduce

import torch
from transformers import *
from fastai2.text.all import *

from blurr.utils import *
from blurr.data.core import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Prefer GPU #1 (the index this notebook was originally run with), but fall back to
# GPU #0 when fewer than two devices are visible; a hard-coded ordinal raises on such hosts.
if torch.cuda.is_available():
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
    print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')
else:
    print('CUDA is not available; running on CPU')

Using GPU #1: GeForce GTX 1080 Ti


## Summarization tokenization, batch transform, and DataBlock methods

Summarization tasks attempt to capture the meaning of a larger body of text in 1-3 sentences.

In [None]:
# Read the sample CSV relative to the current directory; the row count is the cell's output.
path = Path('./')
cnndm_df = pd.read_csv(path/'cnndm_sample.csv')
len(cnndm_df)

1000

In [None]:
# Preview the first two rows of the loaded sample.
cnndm_df.head(2)

Unnamed: 0,article,highlights,ds_type
0,"(CNN) -- Globalization washes like a flood over the world's cultures and economies. Floods can be destructive; however, they can also bring blessings, as the annual floods of the Nile did for ancient Egypt. The world's great universities can be crucial instruments in shaping, in a positive way, humankind's reaction to globalization and the development of humankind itself. Traditionally, universities have been defined and limited by location, creating an academic community and drawing students and scholars to that place. Eventually, some universities began to encourage students to study el...","John Sexton: Traditionally, universities have been defined and limited by location .\nGlobal campuses form a network of thought, innovation, he writes .\nFaculty can teach, Sexton says, students can team up in many cities at once .\nSexton: Research, scholarship can be shared and cultural ties made in ""century of knowledge""",train
1,"(CNN) -- Armenian President Robert Kocharian declared a state of emergency Saturday night after a day of clashes between police and protesters, a spokeswoman for the Armenian Foreign Ministry said. Opposition supporters wave an Armenian flag during a protest rally in Yerevan, Armenia, on Saturday. The protesters claim last month's presidential election was rigged. The state of emergency will ""hopefully bring some order"" to the capital, Yerevan, said Salpi Ghazarian, assistant to the Armenian foreign minister, who spoke to CNN early Sunday. The state of emergency could last until March 20, ...","NEW: Protest moves after crackdown at Freedom Square .\nOrder sought after protests over last month's election turn violent .\nDemonstrators say the election was fraudulent .\nState of emergency could last until March 20, official says .",train


In [None]:
# Bare expression: the cell output shows the fully qualified class this name resolves to.
BartForConditionalGeneration

transformers.modeling_bart.BartForConditionalGeneration

In [None]:
# Checkpoint whose architecture name, tokenizer, config and model are fetched below.
pretrained_model_name = "bart-large-cnn"

(hf_arch, hf_tokenizer, hf_config, hf_model) = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name,
    BartTokenizer,
    HF_MODELS.BartForConditionalGeneration,
)

# Display the architecture name and the concrete types that were returned.
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)

('bart',
 transformers.tokenization_bart.BartTokenizer,
 transformers.configuration_bart.BartConfig,
 transformers.modeling_bart.BartForConditionalGeneration)

In [None]:
#export
class HF_SummaryInput(list):
    "A plain `list` subclass used to tag summarization inputs so type-dispatched functions (e.g. `show_batch`) can target them."

In [None]:
#export
@typedispatch
def build_hf_input(task:ForConditionalGenerationTask, tokenizer, a_tok_ids, b_tok_ids=None, targets=None,
                   max_length=512, pad_to_max_length=True, truncation_strategy='longest_first',
                   trg_tok_kwargs={}):
    """Build the (input, target) pair for a `ForConditionalGenerationTask`.

    The default `build_hf_input` dispatch (`task=None`) is called first with the
    input-side settings. The first element of the second item of its result is then
    re-processed with `tokenizer.prepare_for_model`, using `max_length` and
    `truncation_strategy` from `trg_tok_kwargs` when present and the input-side
    values otherwise. `pad_to_max_length` is shared by both calls.

    Returns a tuple of `HF_SummaryInput` (wrapping the first item of the base result)
    and the tuplified `input_ids` produced for the target.
    """
    # Input side: delegate to the default dispatch; no target kwargs are forwarded.
    enc_res = build_hf_input(None, tokenizer, a_tok_ids, b_tok_ids, targets,
                             max_length, pad_to_max_length, truncation_strategy, {})

    # Target side: explicit overrides win, otherwise reuse the input-side settings.
    trg_max_length = trg_tok_kwargs.get('max_length', max_length)
    trg_truncation = trg_tok_kwargs.get('truncation_strategy', truncation_strategy)

    # presumably enc_res[1][0] holds the target token ids as a tensor - TODO confirm against the base dispatch
    trg_tok_ids = enc_res[1][0].tolist()
    trg_res = tokenizer.prepare_for_model(trg_tok_ids, None,
                                          max_length=trg_max_length,
                                          pad_to_max_length=pad_to_max_length,
                                          truncation_strategy=trg_truncation,
                                          return_tensors='pt')

    return HF_SummaryInput(enc_res[0]), tuplify(trg_res['input_ids'][0])

In [None]:
# First block handles the article text with default settings; the second is set up for
# conditional generation and limits the target to a max_length of 50.
blocks = (
    HF_TextBlock(hf_arch, hf_tokenizer),
    HF_TextBlock(hf_arch, hf_tokenizer,
                 task=ForConditionalGenerationTask(),
                 trg_tok_kwargs={'max_length': 50}),
)

dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('article'),
    get_y=ColReader('highlights'),
    splitter=RandomSplitter(),
)

In [None]:
# dblock.summary(cnndm_df)

In [None]:
# Build the DataLoaders from the DataBlock with a batch size of 4.
dls = dblock.dataloaders(cnndm_df, bs=4)

In [None]:
# Grab a single batch for inspection.
b = dls.one_batch()

In [None]:
# Number of batch elements, then the shapes of the first input tensor and of the targets.
len(b), b[0][0].shape, b[1].shape

(2, torch.Size([4, 512]), torch.Size([4, 50]))

In [None]:
#export
@typedispatch
def show_batch(x:HF_SummaryInput, y, samples, hf_tokenizer, skip_special_tokens=True, ctxs=None, max_n=6, **kwargs):
    """Display decoded (text, summary) pairs from a batch as a DataFrame.

    Each item of `x[0]` is paired with the corresponding item of `y`; both are decoded
    with `hf_tokenizer.decode` and stripped of any remaining pad-token text. The first
    `max_n` rows are shown via `display_df`. `samples` and `**kwargs` are accepted but
    not used here.

    Returns `ctxs` unchanged.
    """
    # Read once: a tokenizer with no pad token configured reports `None` here.
    pad_tok = hf_tokenizer.pad_token

    def _decode(tok_ids):
        "Decode `tok_ids` and remove pad-token text when the tokenizer defines a pad token."
        txt = hf_tokenizer.decode(tok_ids, skip_special_tokens=skip_special_tokens)
        # `str.replace(None, '')` raises TypeError, so only strip when a pad token exists.
        return txt.replace(pad_tok, '') if pad_tok else txt

    res = L()
    for inp, trg in zip(x[0], y):
        res.append((_decode(inp), _decode(trg)))

    display_df(pd.DataFrame(res, columns=['text', 'summary'])[:max_n])
    return ctxs

In [None]:
# Show two samples; `hf_tokenizer` is supplied as a keyword so the summarization `show_batch` above can decode token ids.
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,summary
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 varieties of birds and 26,000 classifications of plants. Pronatura, a non-profit organization that works to promote conservation and sustainable development in Mexico, has selected six species which it says symbolize the problems faced by the destruction of nature. ""These are only some of the species which have some degree of conservation,"" says Eduardo Cota Corona, Director of Conservation at Pronatura. ""However, there is a countless number of species in Mexico which find themselves in danger of extinction."" Golden Eagle. It is the country's national symbol yet the Golden Eagle is close to extinction in Mexico. One of the largest raptors or birds of prey in the world, the Golden Eagle's wingspan can reach lengths greater than two metres. Only the Bald Eagle and the California Greater exceed it in size in North America. 
With its powerful hooked bill and long and sharp claws it can sometimes capture prey of a size that is surprising for its size, including crane, wild ungulates and domestic livestock, though more often than not it tends to feed off small mammals such as rabbits, hares, ground squirrels and prairie dogs as well as reptiles and small-to-medium sized birds. Primarily a solitary bird, the Golden Eagle pairs up to breed, building nests made of dry branches in cliffs and escarpments. The female typically lays two eggs which are incubated by both the male and female. Usually, only one of the hatchlings survives. The Golden Eagle can be found in","Mexico hosts to up to 10 percent of all known species on Earth.\nIt is home to 502 types of mammals, 290 bird species and 26,000 types of plants.\nHuman development and climate change is placing a big strain on its"
1,"Watch your step as you climb these stairs, whether spiraling up mountains, narrow passageways or sky-scraping attractions. All is right with the world when you're gazing down from the rooftop of Milan's Duomo. That is, until you remember the steep marble stairs that got you there—and are your only way down. Stairways can leave just as much of an impact on your memory as the places they lead you. Some are so eye-catching they look like they belong in an M.C. Escher painting, while other stairs are downright intimidating, especially when they stand between you and a site you flew halfway across the world to experience. In Peru, for instance, travelers need to tackle about 600 feet of slippery granite rocks carved into the mountainside to reach the Moon Temple at Machu Picchu. And at Yosemite National Park, you can't take a selfie at the top of Half Dome without climbing a cable ladder up the rock face for more than 400 feet. All it takes is a misstep for any old staircase to become treacherous (just ask Jennifer Lawrence), yet some standout for being especially scary. A set of stairs in Hawaii is so precariously perched that climbing is now illegal. In China, there's a stairway with an age requirement. Other stairs are intimidating for more psychological reasons, such as the creaking noises made by the world's longest wooden stairway in Norway or the eerie atmosphere at ""The Stairway to Hell,"" part of an abandoned industrial complex in Japan. Travelers with nerves of steel—and eager for bragging rights—follow these stairs because of what they find at the end, whether a sacred Hindu temple or the top of a spectacular waterfall. There's nothing quite like the thrill of accomplishment that comes once you've taken that last step. Safely, that is. Angkor Wat Temple Stairs, Cambodia. 
In this super-humid hotbox of Buddhist history, there's no shame in bowing down on your hands and knees or pulling yourself up with the provided ropes to scale the nearly 70 percent inclined stairs of Angkor Wat's uppermost temples. Guides claim the steps were made to be so steep to remind people that heaven was hard to reach—though you might make the same argument about Earth as you try not to tumble on the way down. The Verrückt, Kansas City, Kansas. It takes guts just to reach the starting point of the world's tallest and fastest water slide, opened July 2014. To",Moon Temple at Machu Picchu can be reached by 600-foot slippery granite steps.\nThe journey to Yosemite's Half Dome requires a long cable ladder.\nA misstep on any of these staircases can be lethal.


## Cleanup

In [None]:
#hide
# Run nbdev's notebook export; the cell output lists each notebook that was converted.
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-language-modeling.ipynb.
Converted 01c_data-question-answering.ipynb.
Converted 01d_data-token-classification.ipynb.
Converted 01e_data-summarization.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-language-modeling.ipynb.
Converted 02c_modeling-question-answering.ipynb.
Converted 02d_modeling-token-classification.ipynb.
Converted index.ipynb.
