In [None]:
# default_exp data

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data

> This module contains the pieces required to use Hugging Face tokenizers with the fastai DataBlock API and/or the mid-level data processing pipelines.

In [None]:
#export
from blurr.utils import *

import torch
from transformers import *
from fastai2.text.all import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Make CUDA device index 1 the current device and print which GPU is in use.
# NOTE(review): the device index is hard-coded; presumably this fails on a machine
# that exposes fewer than two GPUs — confirm before running elsewhere.
torch.cuda.set_device(1)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


In [None]:
#export
class HF_BaseInput(list):
    """Plain `list` subclass that adds no behavior of its own.

    It exists as a distinct type: `HF_BatchTransform.encodes` wraps the model
    inputs in it, and the `show_batch` overload in this notebook is annotated
    with it for `typedispatch`.
    """

In [None]:
#export
class HF_Tokenizer():
    """Callable adapter around a Hugging Face tokenizer.

    Stores `hf_arch` and `hf_tokenizer` as given. Any extra keyword arguments
    are accepted and ignored.
    """

    def __init__(self, hf_arch, hf_tokenizer, **kwargs):
        self.hf_arch, self.hf_tokenizer = hf_arch, hf_tokenizer

    def __call__(self, items):
        "Lazily yield the result of `_tokenize` for every text in `items`."
        yield from map(self._tokenize, items)

    def _tokenize(self, txt):
        "Tokenize a single text with the wrapped tokenizer's `tokenize` method."
        tokens = self.hf_tokenizer.tokenize(txt)
        return tokens

In [None]:
#export
class HF_BatchTransform(Transform):
    """Convert a list of samples into padded Hugging Face model inputs.

    For every sample, element 0 holds the token ids (either one object with a
    `.tolist()` method or a tuple of two such objects for a text pair) and the
    remaining elements are passed through untouched as targets.
    """

    def __init__(self, hf_arch, hf_tokenizer, max_seq_len=512, truncation_strategy='longest_first'):
        self.hf_arch, self.hf_tokenizer = hf_arch, hf_tokenizer
        store_attr(self, 'max_seq_len, truncation_strategy')

    def encodes(self, samples):
        """Return one `(HF_BaseInput([input_ids, token_type_ids, attention_mask]), *targets)` tuple per sample."""

        def _first_or_placeholder(res, key):
            # Outputs the tokenizer does not return are replaced by a one-element
            # placeholder tensor so every sample has the same three-slot layout.
            return res[key][0] if (key in res) else torch.tensor([-9999])

        batch = []
        for sample in samples:
            text, targets = sample[0], sample[1:]

            # A tuple means a text pair; anything else is a single sequence.
            if isinstance(text, tuple):
                ids_a, ids_b = text[0].tolist(), text[1].tolist()
            else:
                ids_a, ids_b = text.tolist(), None

            encoded = self.hf_tokenizer.prepare_for_model(
                ids_a, ids_b,
                max_length=self.max_seq_len,
                pad_to_max_length=True,
                truncation_strategy=self.truncation_strategy,
                return_tensors='pt',
            )

            # [0] takes the first entry of each returned tensor — presumably dropping
            # the leading batch dimension added by return_tensors='pt'.
            model_inputs = HF_BaseInput([
                encoded['input_ids'][0],
                _first_or_placeholder(encoded, 'token_type_ids'),
                _first_or_placeholder(encoded, 'attention_mask'),
            ])
            batch.append((model_inputs, *targets))

        return batch

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """`TransformBlock` for text that is tokenized with a Hugging Face tokenizer.

    Item-level transforms are the supplied tokenization transforms followed by
    `Numericalize`. Batches are assembled by `SortedDL`, with an
    `HF_BatchTransform` (or the caller's `hf_batch_tfm`) as `before_batch`.
    """

    @delegates(Numericalize.__init__)
    def __init__(self, tok_tfms, hf_arch, hf_tokenizer, 
                 hf_batch_tfm=None, vocab=None, max_seq_len=512, **kwargs):
        """
        tok_tfms:      iterable of tokenization transforms applied before `Numericalize`
        hf_arch:       architecture identifier, forwarded to the default `HF_BatchTransform`
        hf_tokenizer:  Hugging Face tokenizer, forwarded to the default `HF_BatchTransform`
        hf_batch_tfm:  optional ready-made batch transform; when given, `max_seq_len` is not used here
        vocab:         vocab passed to `Numericalize`
        max_seq_len:   maximum length used only when the default batch transform is built
        **kwargs:      extra arguments forwarded to `Numericalize`
        """
        if hf_batch_tfm is None:
            hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, max_seq_len=max_seq_len,
                                             truncation_strategy='longest_first')

        # `__init__` must not return a value, so the parent initializer is simply called.
        super().__init__(type_tfms=[*tok_tfms, Numericalize(vocab, **kwargs)],
                         dl_type=SortedDL,
                         dls_kwargs={'before_batch': hf_batch_tfm})

    @classmethod
    @delegates(Tokenizer.from_df, keep=True)
    def from_df(cls, text_cols_lists, hf_arch, hf_tokenizer,
                res_col_names=None, vocab=None, 
                hf_batch_tfm=None, max_seq_len=512, **kwargs):
        """Build the block from DataFrame columns.

        text_cols_lists: list of column-name lists; one tokenization transform is built per inner list
        res_col_names:   names of the tokenized output columns (defaults to `text0`, `text1`, ...)
        vocab:           token list for numericalization (defaults to the tokenizer's vocab ordered by id)
        hf_batch_tfm, max_seq_len: forwarded to the constructor
        **kwargs:        forwarded to `Tokenizer.from_df`
        """
        # wrap the hf tokenizer so fastai's Tokenizer can instantiate it via tok_func
        tokenizer_cls = partial(HF_Tokenizer, hf_arch=hf_arch, hf_tokenizer=hf_tokenizer)

        if vocab is None:
            # Numericalize uses the position in this list as the token id, and those ids are
            # later handed back to the Hugging Face tokenizer. Order explicitly by id instead
            # of relying on the iteration order of the dict returned by `get_vocab()`.
            # (Position equals id as long as the ids are contiguous from 0.)
            vocab = [tok for tok, _ in sorted(hf_tokenizer.get_vocab().items(), key=lambda kv: kv[1])]

        # build the column name(s) returned after tokenization
        if res_col_names is None:
            res_col_names = [f'text{i}' for i in range(len(text_cols_lists))]

        tok_tfms = [Tokenizer.from_df(text_cols,
                                      res_col_name=res_col_name,
                                      tok_func=tokenizer_cls,
                                      rules=[], **kwargs)
                    for text_cols, res_col_name in zip(text_cols_lists, res_col_names)]

        return cls(tok_tfms, hf_arch=hf_arch, hf_tokenizer=hf_tokenizer,
                   hf_batch_tfm=hf_batch_tfm, vocab=vocab, max_seq_len=max_seq_len)

Example: the Hugging Face model requires a single text input (e.g., sequence classification).

In [None]:
# Fetch the IMDB sample dataset with fastai's `untar_data` and load its `texts.csv` into a DataFrame.
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not used by any cell visible in this notebook.
model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


In [None]:
# Select the sequence-classification task and load the config for the chosen checkpoint,
# then ask the blurr helper for the four objects used in the rest of this example.
task = HF_TASKS_AUTO.ForSequenceClassification

# Other checkpoint names the author tried are kept in the trailing comment.
pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
config = AutoConfig.from_pretrained(pretrained_model_name)

hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, 
                                                                                    task=task, 
                                                                                    config=config)

In [None]:
# single input
blocks = (
    HF_TextBlock.from_df(text_cols_lists=[['text']], hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
    CategoryBlock
)

dblock = DataBlock(blocks=blocks, 
                   get_x=lambda x: x.text0,
                   get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
# Build the DataLoaders from `imdb_df` with a batch size of 4.
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
# Grab one batch and check its layout: (inputs, targets), three input tensors, four targets (bs=4).
b = dls.one_batch(); len(b), len(b[0]), len(b[1]) 

(2, 3, 4)

In [None]:
# Shapes of input_ids, token_type_ids, attention_mask and the targets.
# NOTE(review): the width-1 second tensor is presumably the `-9999` placeholder that
# HF_BatchTransform emits when the tokenizer returns no `token_type_ids` — confirm.
b[0][0].shape, b[0][1].shape, b[0][2].shape, b[1].shape

(torch.Size([4, 512]),
 torch.Size([4, 1]),
 torch.Size([4, 512]),
 torch.Size([4]))

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, hf_tokenizer, ctxs=None, max_n=6, **kwargs):
    """Show a batch whose inputs are an `HF_BaseInput`.

    The first element of `x` (the input ids) is decoded back to text with
    `hf_tokenizer.decode`, and the decoded text replaces the first element of
    each sample. Rendering is delegated to the `object` overload of `show_batch`,
    after which the contexts are displayed as a DataFrame and returned.
    """
    if ctxs is None:
        ctxs = get_empty_df(min(len(samples), max_n))

    input_ids = x[0]
    decoded_samples = L(
        (TitledStr(hf_tokenizer.decode(ids)), *sample[1:])
        for ids, sample in zip(input_ids, samples)
    )

    ctxs = show_batch[object](x, y, decoded_samples, max_n=max_n, ctxs=ctxs, **kwargs)
    display_df(pd.DataFrame(ctxs))
    return ctxs

In [None]:
# Display up to two decoded samples; `hf_tokenizer` is needed by the HF_BaseInput `show_batch` overload.
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,category
0,"<s>Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old boy called, you guessed it, Victor Vargas (Victor Rasuk) who lives his teenage years chasing more skirt than the Rolling Stones could do in all the years they've toured. The movie starts off in `Ugly Fat' Donna's bedroom where Victor is sure to seduce her, but a cry from outside disrupts his plans when his best-friend Harold (Kevin Rivera) comes-a-looking for him. Caught in the attempt by Harold and his sister, Victor Vargas runs off for damage control. Yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young Victor from going off on the hunt for more fresh meat. On a hot, New York City day they make way to the local public swimming pool where Victor's eyes catch a glimpse of the lovely young nymph Judy (Judy Marte), who's not just pretty, but a strong and independent too. The relationship that develops between Victor and Judy becomes the focus of the film. The story also focuses on Victor's family that is comprised of his grandmother or abuelita (Altagracia Guzman), his brother Nino (also played by real life brother to Victor, Silvestre Rasuk) and his sister Vicky (Krystal Rodriguez). The action follows Victor between scenes with Judy and scenes with his family. 
Victor tries to cope with being an oversexed pimp-daddy, his feelings for Judy and his grandmother's conservative Catholic upbringing.<br /><br />The problems that arise from Raising Victor Vargas are a few, but glaring errors. Throughout the film you get to know certain characters like Vicky, Nino, Grandma,</s>",negative
1,"<s>With its companion piece MASTERS OF HORROR, NIGHTMARES AND DREAMSCAPES can only be seen as the absolute nadir of the genre that began so auspiciously with THE TWILIGHT ZONE and THE OUTER LIMITS.<br /><br />Of course, part of the problem is that it does nothing to be of any interest to a comparatively adult audience, instead aiming at TEN-YEAR-OLDS, who are only able to count body-bags, and scarcely that. And so grossness is king, and King is grossness.<br /><br />Stephen King is simply illiterate  in general he has the aptitude for storytelling of Bart Simpson. Since he cannot read his sole inspiration is the movies.<br /><br />True, the cinema is not such a bad place to start, since it has generally escaped the onslaught of ""Realism"". But these films are only the rumor, not the thing, and if you want to WRITE, you have to dig deeper.<br /><br />Of course, only PICKMAN had monsters as close acquaintances. But even so, it should be clear to any undergraduate that vampires are not Dracula and Lugosi.<br /><br />At least AUTOPSY ROOM FOUR is a clear indication of what is wrong. One can almost imagine this pathetic dolt sitting as his desk trying to come up with something SCARY.<br /><br />Not, mind you, trying to describe accurately the horror of the system of which he is an integral part, making the stupid stupider, but trying to come up with a scary story for his little nephew. Suppose, you were paralyzed, and people thought you were dead and started to cut you open like they do at those autopsy things! Wouldn't that be gross? And that, boys and girls, is the story.<br /><br />What about characterization? Oh yes, he's one of these suits, who never really appreciated life, you know, and now it's too late, right? And he's shouting  well, they can't actually hear him, you know  he's saying that he's going to sue the hospital, but he's not such a big shot anymore, you see, lying there (or is it laying, I can never remember) and all. 
And he's thinking: Oh no please, please don't cut me and this is terrible, lying (or laying) like that </s>",negative


Example: the Hugging Face model requires two text inputs (e.g., question answering).

In [None]:
# Load the cleaned SQuAD sample from a path relative to the notebook and show its row count.
# NOTE(review): no cell in this notebook downloads or creates this file — it presumably
# has to exist at this location before the cell is run.
path = Path('./data/task-question-answering/squad/')
squad_df = pd.read_csv(path/'squad_cleaned_sample.csv'); len(squad_df)

50000

In [None]:
squad_df.head(2)

Unnamed: 0,title,context,question_id,question_text,is_impossible,answer_text,answer_start,answer_end
0,Bermuda,"The professionals soon displaced the amateur ex-Public schoolboys. Bermuda's role as the primary Royal Navy base in the Western Hemisphere, with an army garrison to match, ensured that the naval and military officers quickly introduced the newly formalised sports to Bermuda, including cricket, football, Rugby football, and even tennis and rowing (rowing did not adapt well from British rivers to the stormy Atlantic. The officers soon switched to sail racing, founding the Royal Bermuda Yacht Club). Once these sports reached Bermuda, they were eagerly adopted by Bermudians.",5ad42b72604f3c001a400930,Who founded the Bermuda Royal Yacht Club?,True,naval and military officers,189,216
1,Black_people,"Additionally, there are around 60,000 non-Jewish African immigrants in Israel, some of whom have sought asylum. Most of the migrants are from communities in Sudan and Eritrea, particularly the Niger-Congo-speaking Nuba groups of the southern Nuba Mountains; some are illegal immigrants.",5706b77e0eeca41400aa0d8f,Where are most of them from?,False,Sudan and Eritrea,157,174


In [None]:
# Upper bound used below to filter the answers and to size the start/end target vocab.
max_seq_len= 512

In [None]:
# Keep only answerable questions whose `answer_end` is below `max_seq_len`.
# NOTE(review): in the sample rows above `answer_end` looks like a character offset
# (answer_start + len(answer_text)), while `max_seq_len` is used as a token length
# by HF_BatchTransform — confirm that comparing the two is intended.
squad_df = squad_df[(squad_df.answer_end < max_seq_len) & (squad_df.is_impossible == False)]

In [None]:
# Same setup as the classification example, but for the question-answering task:
# load the config for the checkpoint and fetch the matching objects from the blurr helper.
# This rebinds `task`, `config`, `hf_arch`, `hf_tokenizer`, `hf_config` and `hf_model`.
task = HF_TASKS_AUTO.ForQuestionAnswering

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, 
                                                                                    task=task, 
                                                                                    config=config)

In [None]:
# Identity mapping {0: 0, ..., max_seq_len-1: max_seq_len-1}, passed below as the
# `vocab` of both CategoryBlocks (answer start / answer end).
vocab = dict(enumerate(range(max_seq_len)));

In [None]:
# (optional): override HF_BatchTransform defaults
hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, max_seq_len=128, truncation_strategy='longest_first')

blocks = (
    HF_TextBlock.from_df(text_cols_lists=[['context'],['question_text']], 
                         hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, hf_batch_tfm=hf_batch_tfm),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab)
)

dblock = DataBlock(blocks=blocks, 
                   get_x=lambda x: (x.text0, x.text1),
                   get_y=[ColReader('answer_start'), ColReader('answer_end')],
                   splitter=RandomSplitter(),
                   n_inp=1)

In [None]:
# dblock.summary(squad_df)

In [None]:
# Build the DataLoaders from the filtered `squad_df` with a batch size of 4 (rebinds `dls`).
dls = dblock.dataloaders(squad_df, bs=4)

In [None]:
# Grab one batch and check its layout: (inputs, start targets, end targets).
b = dls.one_batch(); len(b), len(b[0]), len(b[1]), len(b[2])

(3, 3, 4, 4)

In [None]:
# Shapes of the three input tensors and the two target tensors; the sequence length
# matches the max_seq_len=128 given to the custom HF_BatchTransform.
b[0][0].shape, b[0][1].shape, b[0][2].shape, b[1].shape, b[2].shape

(torch.Size([4, 128]),
 torch.Size([4, 1]),
 torch.Size([4, 128]),
 torch.Size([4]),
 torch.Size([4]))

In [None]:
# Display up to two decoded context/question pairs alongside their two targets.
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,category,category_
0,"<s>In 2013 tourism within the state from local citizens accounted for 39.9% of tourists, the second highest originating location for tourists to Tennessee is the state of Georgia, accounting for 8.4% of tourists.:17 Forty-four percent of stays in the state were ""day trips"", 25% stayed one night, 15% stayed two nights, and 11% stayed 4 or more nights. The average stay was 2.16 nights, compared to 2.03 nights for the US as a whole.:40 The average person spent $</s></s>What percentage of out-of-state tourists stayed in Tennessee for four or more nights?</s>",321,324
1,"<s>KU's Edwards Campus is in Overland Park, Kansas. Established in 1993, its goal is to provide adults with the opportunity to complete college degrees. About 2,100 students attend the Edwards Campus, with an average age of 32. Programs available at the Edwards Campus include developmental psychology, public administration, social work, systems analysis, information technology, engineering management and design.</s></s>When was the Edwards Campus built?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>",64,68


In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data.ipynb.
Converted index.ipynb.
