In [None]:
# default_exp data.seq2seq.summarization

In [None]:
#all_slow

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# data.seq2seq.summarization

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data for summarization tasks using architectures like BART and T5.

In [None]:
#export
import ast
from functools import reduce

import torch
from transformers import *
from fastai.text.all import *

from blurr.utils import *
from blurr.data.core import *
from blurr.data.seq2seq.core import *

logging.set_verbosity_error()

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

from fastai import __version__ as fa_version
from torch import __version__ as pt_version
from transformers import __version__ as hft_version

print(f'Using pytorch {pt_version}')
print(f'Using fastai {fa_version}')
print(f'Using transformers {hft_version}')

Using pytorch 1.7.1+cu110
Using fastai 2.2.7
Using transformers 4.3.3


In [None]:
#cuda
# Prefer GPU #1 (the device this notebook was recorded on), but fall back to GPU #0 when
# fewer than two CUDA devices are visible so the cell does not raise on single-GPU machines.
torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Summarization tokenization, batch transform, and DataBlock methods

Summarization tasks attempt to generate a human-understandable and sensible representation of a larger body of text (e.g., capture the meaning of a larger document in 1-3 sentences).

In [None]:
# Load the CNN/DailyMail sample that sits next to this notebook and show its row count.
path = Path('./')
cnndm_df = pd.read_csv(path/'cnndm_sample.csv')
len(cnndm_df)

1000

In [None]:
cnndm_df.head(2)

Unnamed: 0,article,highlights,ds_type
0,"(CNN) -- Globalization washes like a flood over the world's cultures and economies. Floods can be destructive; however, they can also bring blessings, as the annual floods of the Nile did for ancient Egypt. The world's great universities can be crucial instruments in shaping, in a positive way, humankind's reaction to globalization and the development of humankind itself. Traditionally, universities have been defined and limited by location, creating an academic community and drawing students and scholars to that place. Eventually, some universities began to encourage students to study el...","John Sexton: Traditionally, universities have been defined and limited by location .\nGlobal campuses form a network of thought, innovation, he writes .\nFaculty can teach, Sexton says, students can team up in many cities at once .\nSexton: Research, scholarship can be shared and cultural ties made in ""century of knowledge""",train
1,"(CNN) -- Armenian President Robert Kocharian declared a state of emergency Saturday night after a day of clashes between police and protesters, a spokeswoman for the Armenian Foreign Ministry said. Opposition supporters wave an Armenian flag during a protest rally in Yerevan, Armenia, on Saturday. The protesters claim last month's presidential election was rigged. The state of emergency will ""hopefully bring some order"" to the capital, Yerevan, said Salpi Ghazarian, assistant to the Armenian foreign minister, who spoke to CNN early Sunday. The state of emergency could last until March 20, ...","NEW: Protest moves after crackdown at Freedom Square .\nOrder sought after protests over last month's election turn violent .\nDemonstrators say the election was fraudulent .\nState of emergency could last until March 20, official says .",train


In [None]:
# Checkpoint and task used for the walkthrough below.
pretrained_model_name = "facebook/bart-large-cnn"
task = HF_TASKS_AUTO.Seq2SeqLM

# `get_hf_objects` hands back the architecture name, config, tokenizer and model in that order.
hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(
    pretrained_model_name,
    task=task
)

# Display what was loaded.
(hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model))

('bart',
 transformers.models.bart.tokenization_bart_fast.BartTokenizerFast,
 transformers.models.bart.configuration_bart.BartConfig,
 transformers.models.bart.modeling_bart.BartForConditionalGeneration)

In [None]:
# Inputs go through the seq2seq block; targets use `noop`.
blocks = (
    HF_Seq2SeqBlock(hf_arch, hf_config, hf_tokenizer, hf_model),
    noop,
)

# Articles are the inputs, highlights are the targets, with a random train/valid split.
dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader('article'),
    get_y=ColReader('highlights'),
    splitter=RandomSplitter(),
)

Two lines!  Notice we pass in `noop` for our targets (i.e., our summaries) because the batch transform will take care of both our inputs and targets.

In [None]:
# dblock.summary(cnndm_df)

In [None]:
dls = dblock.dataloaders(cnndm_df, bs=4)

In [None]:
b = dls.one_batch()

In [None]:
# Show the number of items in the batch, the shapes of the `input_ids` and `labels` entries
# of the first item (indexed by key), and the shape of the second item for comparison.
len(b), b[0]['input_ids'].shape, b[0]['labels'].shape, b[1].shape

(2, torch.Size([4, 1024]), torch.Size([4, 84]), torch.Size([4, 84]))

In [None]:
dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000, target_trunc_at=250)

Unnamed: 0,text,target
0,"Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat the","Hotel guests who ""go green"" are happier with their stay.\nIncreasing water and energy costs are pushing hotels to cut costs wherever they can.\nMany hotels find that guests don't mind using the same towels and sheets every night.\nTripAdvisor will be a"
1,"Washington (CNN) -- Few answers have emerged to the myriad questions about the Boston Marathon bombing and its aftermath, but that didn't stop political leaders from clashing about what happened and why it did on Sunday talk shows. Republican members of Congress played up a possible connection to global terrorists and said the lone surviving suspect should be designated an enemy combatant to allow unfettered questioning and unlimited detention. Democratic legislators called for handling the 19-year-old suspect as a crime suspect rather than a war enemy, allowing the U.S. citizen the right to legal representation under federal law that could impose the death penalty. A closer look at their statements and arguments showed how politicians blend facts, conjecture and spin to push their side's agenda while countering arguments from across the aisle. The facts so far tell a still-convoluted story. Tamerlan and Dzhokhar Tsarnaev, brothers of northern Caucasus origin who had lived in the Unit","Partisan posturing emerges over Boston bombings on Sunday talk shows.\nDespite little evidence, Republicans hint of possible international terror ties.\nDemocrats argue against designating the suspect an enemy combatant.\nAuthors of immigration reform"


## Tests

The purpose of the following tests is to ensure, as much as possible, that the core DataBlock code above works for the pretrained **summarization models** below.  These tests are excluded from the CI workflow because of how long they would take to run and the amount of data that would need to be downloaded.

**Note**: Feel free to modify the code below to test whatever pretrained summarization models you are working with ... and if any of your pretrained summarization models fail, please submit a GitHub issue *(or a PR if you'd like to fix it yourself)*

In [None]:
# Keep every `ConditionalGeneration` model class whose name does not start with `TF`.
list(filter(lambda model_type: not model_type.__name__.startswith('TF'),
            BLURR_MODEL_HELPER.get_models(task='ConditionalGeneration')))

[transformers.models.bart.modeling_bart.BartForConditionalGeneration,
 transformers.models.blenderbot.modeling_blenderbot.BlenderbotForConditionalGeneration,
 transformers.models.blenderbot_small.modeling_blenderbot_small.BlenderbotSmallForConditionalGeneration,
 transformers.models.fsmt.modeling_fsmt.FSMTForConditionalGeneration,
 transformers.models.led.modeling_led.LEDForConditionalGeneration,
 transformers.models.mbart.modeling_mbart.MBartForConditionalGeneration,
 transformers.models.mt5.modeling_mt5.MT5ForConditionalGeneration,
 transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration,
 transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration,
 transformers.models.t5.modeling_t5.T5ForConditionalGeneration,
 transformers.models.xlm_prophetnet.modeling_xlm_prophetnet.XLMProphetNetForConditionalGeneration]

In [None]:
pretrained_model_names = [
    'facebook/bart-base',
    'facebook/blenderbot_small-90M',
    'allenai/led-base-16384',
    'google/mt5-small',
    'google/pegasus-cnn_dailymail',
    't5-small', 
    'microsoft/prophetnet-large-uncased',
    'microsoft/xprophetnet-large-wiki100-cased', # XLMProphetNet
]

In [None]:
path = Path('./')
cnndm_df = pd.read_csv(path/'cnndm_sample.csv')

In [None]:
#slow
#hide_output
task = HF_TASKS_AUTO.Seq2SeqLM
bsz = 2          # batch size requested from the DataLoaders
seq_sz = 256     # `max_length` given to the batch transform; inputs are expected to be exactly this long
trg_seq_sz = 40  # `max_target_length` given to the batch transform; targets are expected to be exactly this long

# One (arch, tokenizer class name, model name, 'PASSED'/'FAILED', error) tuple per checkpoint.
test_results = []
for model_name in pretrained_model_names:
    print(f'=== {model_name} ===\n')

    hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(model_name, task=task)
    print(f'architecture:\t{hf_arch}\ntokenizer:\t{type(hf_tokenizer).__name__}\n')

    # not all architectures include a native pad_token (e.g., gpt2, ctrl, etc...), so we add one here
    if (hf_tokenizer.pad_token is None):
        hf_tokenizer.add_special_tokens({'pad_token': '<pad>'})
        hf_config.pad_token_id = hf_tokenizer.get_vocab()['<pad>']
        hf_model.resize_token_embeddings(len(hf_tokenizer))

    def add_t5_prefix(inp):
        "Prepend `summarize: ` to `inp` when the current architecture is `t5`; otherwise return `inp` unchanged"
        return f'summarize: {inp}' if (hf_arch == 't5') else inp

    try:
        # Build the data pipeline *inside* the `try` so that a model which fails while creating
        # the DataLoaders or fetching a batch is recorded as FAILED instead of aborting the
        # whole loop and losing the results for the remaining models.
        before_batch_tfm = HF_Seq2SeqBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model,
                                                          padding='max_length',
                                                          max_length=seq_sz,
                                                          max_target_length=trg_seq_sz)

        blocks = (HF_Seq2SeqBlock(before_batch_tfm=before_batch_tfm), noop)
        dblock = DataBlock(blocks=blocks,
                           get_x=Pipeline([ColReader('article'), add_t5_prefix]),
                           get_y=ColReader('highlights'),
                           splitter=RandomSplitter())

        dls = dblock.dataloaders(cnndm_df, bs=bsz)
        b = dls.one_batch()

        print('*** TESTING DataLoaders ***\n')
        # the batch is an (inputs, targets) pair with the requested batch size and fixed lengths
        test_eq(len(b), 2)
        test_eq(len(b[0]['input_ids']), bsz)
        test_eq(b[0]['input_ids'].shape, torch.Size([bsz, seq_sz]))
        test_eq(len(b[1]), bsz)
        test_eq(b[1].shape, torch.Size([bsz, trg_seq_sz]))

        # tokenizers exposing `add_prefix_space` are expected to have it enabled (`led` is exempt from this check)
        if (hasattr(hf_tokenizer, 'add_prefix_space') and hf_arch not in ['led']):
            test_eq(hf_tokenizer.add_prefix_space, True)

        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'PASSED', ''))
        dls.show_batch(dataloaders=dls, max_n=2, input_trunc_at=1000)

    except Exception as err:
        test_results.append((hf_arch, type(hf_tokenizer).__name__, model_name, 'FAILED', err))

=== facebook/bart-base ===

architecture:	bart
tokenizer:	BartTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 var","Mexico hosts to up to 10 percent of all known species on Earth.\nIt is home to 502 types of mammals, 290 bird species and 26,000 types of plants.\nHuman development"
1,"(CNN) -- Former Rep. Gabrielle Giffords and her husband Mark Kelly have launched what they hope will mark a new era in the battle over gun rights in America. On the second anniversary of a mass shooting in Arizona that wounded Giffords and killed six others, the couple launched a political action committee, Americans for Responsible Solutions, along with a website calling for contributions to help ""encourage elected officials to stand up for solutions to prevent gun violence and protect responsible gun ownership."" In an op-ed in USA Today, the two make their goal clear: to counter the influence of the gun lobby. ""Special interests purporting to represent gun owners but really advancing the interests of an ideological fringe have used big money and influence to cow Congress into submission,"" they write. ""Rather than working to find the balance between our rights and the regulation of a dangerous product, these groups have cast simple protections for our communities as existential threa",NEW: Giffords and Kelly cite overall figure for gun deaths which includes suicides.\nThe two launched a political action committee to raise money to counter the gun lobby.\nA Connecticut lawmaker


=== facebook/blenderbot_small-90M ===

architecture:	blenderbot_small
tokenizer:	BlenderbotSmallTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"dan condon believes in recycling. just not when it comes to his hotel towels. condon composts when he's at home in boulder, colorado. he eats local, organic and fairtrade food and drives a honda crz hybrid sports car. you might call him green. except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. there, he uses a new towel every day. and don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. i could care less about rewards for environmentally conscious behavior unless it's miles "" condon wrote in an email. if hotels can't convince a hybriddriving recycling enthusiast like condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous moviestar hotels. that's the problem of hotels trying to green"" your hotel stay. after guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat themselves","hotel guests who go green"" are happier with their stay. __newln__ increasing water and energy costs are pushing hotels to cut costs wherever they can. __newln__ many hotels find that guests don't mind using"
1,"cnn ) - wondering where to go for your next holiday? experts explain which destinations we should be checking out in 2014. brazil: the world cup. the modern game of football, or soccer, may have been born in england's public schools, but many will claim its soul has settled in brazil. it has the world's most successful international team, winning the world cup five times. it calls what many claim to be the world's greatest player, pele, one of its own. and company managers and bosses are known to demand their employees skip work to watch the big games. the world cup next year then, to be hosted over june and july in 18 cities across the country, is likely to be memorable, to say the least. throw in the rest of the country, which includes rainforests, beaches and a party culture that makes most new year's eve soirees look decidedly pofaced, and you have the makings of an epic trip. daisy parker from abta association of british travel agents ) suggests visiting some of the country's more",new zealand government threw 50 million into the construction of the nga haerenga cycle trails. __newln__ nosara in costa rica recently awarded a blue flag - a certification awarded to world '


=== allenai/led-base-16384 ===

architecture:	led
tokenizer:	LEDTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"Dan Condon believes in recycling. Just not when it comes to his hotel towels. Condon composts when he's at home in Boulder, Colorado. He eats local, organic and fair-trade food and drives a Honda CR-Z hybrid sports car. You might call him green. Except he's not so green when he travels for his work at an education nonprofit and stays in a hotel, which happens about 10 weeks per year. There, he uses a new towel every day. And don't try to bribe him with a drink or dessert coupon to get him to reuse the same one. ""I could care less about rewards for environmentally conscious behavior unless it's miles,"" Condon wrote in an e-mail. If hotels can't convince a hybrid-driving recycling enthusiast like Condon to go green while traveling, how can they possibly convince everyone else? 9 glamorous movie-star hotels. That's the problem of hotels trying to ""green"" your hotel stay. After guests have paid a pretty penny for a night at the inn, even the most environmental guests may want to treat them","Hotel guests who ""go green"" are happier with their stay.\nIncreasing water and energy costs are pushing hotels to cut costs wherever they can.\nMany hotels find that guests don"
1,"London (CNN) -- In 1948, a hospital outside London witnessed the birth of the Paralympic movement, as a Jewish doctor who had fled Nazi Germany sought to change the lives of patients with spinal injuries -- and inspire new hope in them through sport. The first ""Stoke Mandeville Games"" were organized in 1948 to coincide with the London Olympics, the second to be held in Britain. Named for the hospital in Buckinghamshire where Prof. Ludwig Guttmann's pioneering spinal injuries unit was based, the competitors in those initial Games -- 14 men and two women -- took part in a wheelchair archery contest. Many were military veterans injured on the battlefields of World War II. Just a year later, six teams competed at Stoke Mandeville -- with wheelchair netball, a forerunner of wheelchair basketball, being introduced -- as sport became a central part of a rehabilitation process that had been revolutionized by Guttmann. In 1956, a ""statement of intent"" was unveiled for the Games, which were by t","Paralympic movement was born in Stoke Mandeville, outside London, in 1948.\n2012 Games will be the biggest yet, with 4,200 competitors from 165 countries.\n"


=== google/mt5-small ===

architecture:	mt5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds,","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of"
1,"Police missed speaking with suspected Los Angeles International Airport shooter Paul Ciancia by ""a matter of minutes"" the day his family asked authorities to check on him after receiving disturbing messages, according to the chairman of the House Homeland Security Committee. By the time officers arrived at Ciancia's apartment Friday, he had already left -- as little as 45 minutes earlier -- for the airport, according to U.S. Rep. Michael McCaul. As authorities piece together Ciancia's actions leading up to the shooting, new details emerged Sunday from those close to the suspect. According to someone who knew Ciancia and his roommates well, Ciancia began asking one of his three roommates days before the shooting if he could get a ride to the airport. Ciancia told the roommate that his father, back in New Jersey, was sick and he needed to get home ""to go help take care of him,"" the source said. Ciancia didn't, however, indicate what day he needed to leave. On the day of the shooting, Cia",Paul Ciancia began asking for a ride to the airport days before the shooting. Police performing a welfare check at his family's request missed him by


=== google/pegasus-cnn_dailymail ===

architecture:	pegasus
tokenizer:	PegasusTokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 vari","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants. Human development and climate change"
1,"(CNN) -- Two weeks. Two gut-wrenching, frustrating, mysterious weeks. That's how long it's been since 227 passengers and 12 crew members boarded Malaysia Airlines Flight 370, destined for Beijing. A routine trip, it seemed, to catch up relatives in time for the weekend, start on a work assignment or just get away. Where they got to, still unknown. An exhaustive search -- covering a mind-boggling 2.97 million square miles, which is nearly the size of the continental United States -- has yielded some clues, but no proof of where the Boeing 777 is or definitively what happened to it. The latest, most notable lead revolved around two large objects detected by satellite Sunday floating on waters over 1,400 miles off of Australia's west coast. The first of several Australian military planes, as well as two long-range commercial jets, resumed their search Saturday morning to find any trace of the objects, amid some skepticism that they or ships in the area ever will and, if they do, that what",NEW: Planes depart Australia to resume their search for airplane debris. NEW: Official: Passengers' relatives are moved to a different Kuala Lumpur hotel. Objects seen on satellite spark intensive search


=== t5-small ===

architecture:	t5
tokenizer:	T5TokenizerFast

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"summarize: (CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds,","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants. Human development"
1,"summarize: (CNN) -- Two weeks. Two gut-wrenching, frustrating, mysterious weeks. That's how long it's been since 227 passengers and 12 crew members boarded Malaysia Airlines Flight 370, destined for Beijing. A routine trip, it seemed, to catch up relatives in time for the weekend, start on a work assignment or just get away. Where they got to, still unknown. An exhaustive search -- covering a mind-boggling 2.97 million square miles, which is nearly the size of the continental United States -- has yielded some clues, but no proof of where the Boeing 777 is or definitively what happened to it. The latest, most notable lead revolved around two large objects detected by satellite Sunday floating on waters over 1,400 miles off of Australia's west coast. The first of several Australian military planes, as well as two long-range commercial jets, resumed their search Saturday morning to find any trace of the objects, amid some skepticism that they or ships in the area ever will and, if they do",NEW: Planes depart Australia to resume their search for airplane debris. NEW: Official: Passengers' relatives are moved to a different Kuala Lumpur hotel.


=== microsoft/prophetnet-large-uncased ===

architecture:	prophetnet
tokenizer:	ProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"( cnn ) - - home to up to 10 percent of all known species, mexico is recognized as one of the most biodiverse regions on the planet. the twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. and there is a great deal to lose. in the united nations environment program ( unep ) world conservation monitoring centre's list of megadiverse countries mexico ranks 11th. the list represents a group of 17 countries that harbor the majority of the earth's species and are therefore considered extremely biodiverse. from its coral reefs in the caribbean sea to its tropical jungles in chiapas and the yucatan peninsula and its deserts and prairies in the north, mexico boasts an incredibly rich variety of flora and fauna. some 574 out of 717 reptile species found in mexico - - the most in any country - - can only be encountered within its borders. it is home to 502 types of mammals, 290 species of birds, 1,","mexico hosts to up to 10 percent of all known species on earth. it is home to 502 types of mammals, 290 bird species and 26, 000 types of plants. human development and climate"
1,"( cnn student news ) - - october 27, 2009. downloadable maps. download pdf maps related to today's show : • afghanistan & pakistan • los angeles & san diego • ft. jackson, south carolina. transcript. this is a rush transcript. this copy may not be in its final form and may be updated. natisha lance, cnn student news anchor : a member of the military is making history. we'll explain how in today's edition of cnn student news. hi, everyone. carl azuz is off this week. i'm natisha lance. first up : afghan crashes. lance : first up, pakistan and afghanistan. the countries share a border, and they also share a common problem : threats from militant groups and terrorists like the taliban and al qaeda. it's an issue facing both nations'governments, and one that the u. s. government is concerned about as well. that's why president obama has been holding a series of meetings with some of his advisers. they're reviewing the u. s. strategy in afghanistan and pakistan. samantha hayes has the lates",consider u. s. efforts to offer afghan citizens an alternative to the taliban. hear how a proposed health care bill addresses the issue of the public option. meet a soldier who is making history at


=== microsoft/xprophetnet-large-wiki100-cased ===

architecture:	xlm_prophetnet
tokenizer:	XLMProphetNetTokenizer

*** TESTING DataLoaders ***



Unnamed: 0,text,target
0,"(CNN) -- Home to up to 10 percent of all known species, Mexico is recognized as one of the most biodiverse regions on the planet. The twin threats of climate change and human encroachment on natural environments are, however, threatening the existence of the country's rich wildlife. And there is a great deal to lose. In the United Nations Environment Program (UNEP) World Conservation Monitoring Centre's list of megadiverse countries Mexico ranks 11th. The list represents a group of 17 countries that harbor the majority of the Earth's species and are therefore considered extremely biodiverse. From its coral reefs in the Caribbean Sea to its tropical jungles in Chiapas and the Yucatan peninsula and its deserts and prairies in the north, Mexico boasts an incredibly rich variety of flora and fauna. Some 574 out of 717 reptile species found in Mexico -- the most in any country -- can only be encountered within its borders. It is home to 502 types of mammals, 290 species of birds, 1,150 vari","Mexico hosts to up to 10 percent of all known species on Earth. It is home to 502 types of mammals, 290 bird species and 26,000 types of plants"
1,"Police missed speaking with suspected Los Angeles International Airport shooter Paul Ciancia by ""a matter of minutes"" the day his family asked authorities to check on him after receiving disturbing messages, according to the chairman of the House Homeland Security Committee. By the time officers arrived at Ciancia's apartment Friday, he had already left -- as little as 45 minutes earlier -- for the airport, according to U.S. Rep. Michael McCaul. As authorities piece together Ciancia's actions leading up to the shooting, new details emerged Sunday from those close to the suspect. According to someone who knew Ciancia and his roommates well, Ciancia began asking one of his three roommates days before the shooting if he could get a ride to the airport. Ciancia told the roommate that his father, back in New Jersey, was sick and he needed to get home ""to go help take care of him,"" the source said. Ciancia didn't, however, indicate what day he needed to leave. On the day of the shooting, Cia",Paul Ciancia began asking for a ride to the airport days before the shooting. Police performing a welfare check at his family's request missed him by less than an hour.


In [None]:
#slow
#hide_input
# Turn the per-model result tuples into a table, one row per tested checkpoint.
test_results_df = pd.DataFrame(
    data=test_results,
    columns=['arch', 'tokenizer', 'model_name', 'result', 'error'],
)
display_df(test_results_df)

Unnamed: 0,arch,tokenizer,model_name,result,error
0,bart,BartTokenizerFast,facebook/bart-base,PASSED,
1,blenderbot_small,BlenderbotSmallTokenizer,facebook/blenderbot_small-90M,PASSED,
2,led,LEDTokenizerFast,allenai/led-base-16384,PASSED,
3,mt5,T5TokenizerFast,google/mt5-small,PASSED,
4,pegasus,PegasusTokenizerFast,google/pegasus-cnn_dailymail,PASSED,
5,t5,T5TokenizerFast,t5-small,PASSED,
6,prophetnet,ProphetNetTokenizer,microsoft/prophetnet-large-uncased,PASSED,
7,xlm_prophetnet,XLMProphetNetTokenizer,microsoft/xprophetnet-large-wiki100-cased,PASSED,


## Cleanup

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data-core.ipynb.
Converted 01a_data-token-classification.ipynb.
Converted 01b_data-question-answering.ipynb.
Converted 01za_data-seq2seq-core.ipynb.
Converted 01zb_data-seq2seq-language-modeling.ipynb.
Converted 01zc_data-seq2seq-summarization.ipynb.
Converted 01zd_data-seq2seq-translation.ipynb.
Converted 02_modeling-core.ipynb.
Converted 02a_modeling-token-classification.ipynb.
Converted 02b_modeling-question-answering.ipynb.
Converted 02za_modeling-seq2seq-core.ipynb.
Converted 02zb_modeling-seq2seq-language-modeling.ipynb.
Converted 02zc_modeling-seq2seq-summarization.ipynb.
Converted 02zc_modeling-seq2seq-translation.ipynb.
Converted 99a_examples-multilabel.ipynb.
Converted index.ipynb.
