In [None]:
# default_exp data

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# data

> This module contains the bits required to use the fastai DataBlock API and/or mid-level data processing pipelines to organize your data in a way modelable by huggingface transformer implementations.

In [None]:
#export
import ast
from functools import reduce

from blurr.utils import *

import torch
from transformers import *
from fastai2.text.all import *

In [None]:
#hide
import pdb

from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#cuda
# Use GPU #1 when at least two devices are visible; otherwise fall back to GPU #0 so that
# this cell does not raise on single-GPU hosts.
gpu_id = 1 if torch.cuda.device_count() > 1 else 0
torch.cuda.set_device(gpu_id)
print(f'Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}')

Using GPU #1: GeForce GTX 1080 Ti


## Base tokenization, batch transform, and DataBlock methods

In [None]:
#export
class HF_BaseInput(list):
    """A plain `list` subclass that gives the bundle of huggingface model inputs its own type."""

The `HF_BaseInput` object is used to encapsulate all the inputs required by whatever huggingface model we are using. We use it as a container for the `input_ids`, `token_type_ids`, and `attention_mask` tensors required by most models, and also as a means to customize `@typedispatch`ed functions like `DataLoaders.show_batch` and `Learner.show_results`.

In [None]:
#export
class HF_Tokenizer():
    """huggingface friendly tokenization function.

    Calling an instance with an iterable of items lazily yields one list of sub-tokens per item.

    `mode` selects how each item is interpreted:
    - 'str':  the item is a raw string that is passed straight to `hf_tokenizer.tokenize`
    - 'list': the item is a list/tuple of entities, or the string representation of one (e.g., "['a', 'b']");
              any other string is split with `list_split_func`. Each entity is tokenized on its own and the
              resulting sub-tokens are concatenated in order.
    """
    def __init__(self, hf_arch, hf_tokenizer, mode='str', list_split_func=str.split, **kwargs):
        # extra `kwargs` are accepted but not used
        store_attr(self, 'hf_arch, hf_tokenizer, mode, list_split_func')

    def __call__(self, items): 
        for txt in items: yield self._tokenize(txt)

    def _tokenize(self, txt):
        """Returns the list of sub-tokens for a single item; raises `ValueError` for an unsupported `mode`."""
        if (self.mode == 'str'): 
            return self.hf_tokenizer.tokenize(txt)
        
        if (self.mode == 'list'):
            # `str(entity)` keeps non-string entities (e.g., numbers coming out of `literal_eval`) tokenizable
            return [sub_toks 
                    for entity in self._to_entities(txt) 
                    for sub_toks in self.hf_tokenizer.tokenize(str(entity))]

        # previously an unknown mode silently produced `None` instead of tokens
        raise ValueError(f"Unsupported mode '{self.mode}'; expected 'str' or 'list'")

    def _to_entities(self, txt):
        """Turns a 'list' mode item into the list of entities to tokenize."""
        # already a sequence of entities, nothing to parse
        if isinstance(txt, (list, tuple)): return list(txt)

        # only the errors `literal_eval` documents for malformed input are treated as "not a list literal";
        # anything else (e.g., KeyboardInterrupt) propagates instead of being swallowed
        try: parsed = ast.literal_eval(txt)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError): parsed = None

        if isinstance(parsed, (list, tuple)): return list(parsed)

        # text that is not a list literal (including text that parses to a scalar such as "2014") gets split
        return self.list_split_func(txt)

`HF_Tokenizer` complies with the requirements of a basic tokenization function in fastai.  See [here](http://dev.fast.ai/text.core#Tokenizing).

We've updated the `_tokenize` method to operate on a string or a list (the latter being very handy for tasks like token classification whereby the examples consist of a list of tokens and a list of labels for each).

In [None]:
#export
@typedispatch
def build_hf_input(task, tokenizer, a_tok_ids, b_tok_ids=None, targets=None,
                   max_length=512, pad_to_max_length=True, truncation_strategy='longest_first'):
    """Default assembly of the model inputs for a single example.

    `a_tok_ids` (and optionally `b_tok_ids`) are run through `tokenizer.prepare_for_model`.
    Returns `(HF_BaseInput([input_ids, token_type_ids, attention_mask]), targets)`; `targets` is passed
    through untouched.
    """
    encoded = tokenizer.prepare_for_model(a_tok_ids, b_tok_ids,
                                          max_length=max_length,
                                          pad_to_max_length=pad_to_max_length,
                                          truncation_strategy=truncation_strategy,
                                          return_tensors='pt')

    def _first_or_placeholder(key):
        # a one element [-9999] tensor stands in for anything the tokenizer did not return
        if key in encoded: return encoded[key][0]
        return torch.tensor([-9999])

    hf_base_input = HF_BaseInput([encoded['input_ids'][0],
                                  _first_or_placeholder('token_type_ids'),
                                  _first_or_placeholder('attention_mask')])

    return hf_base_input, targets
    

`build_hf_input` uses fastai's `@typedispatch` decorator to provide complete flexibility in terms of how your numericalized tokens are assembled, and also what you return via `HF_BaseInput` and as your targets.  You can override this implementation as needed by assigning a type to the `task` argument (and optionally the `tokenizer` argument as well).

What you return here is what will be fed into your huggingface model.

In [None]:
#export
class HF_BatchTransform(Transform):
    """Handles everything you need to assemble a mini-batch of inputs and targets"""
    def __init__(self, hf_arch, hf_tokenizer, max_seq_len=512, truncation_strategy='longest_first', task=None):
        self.hf_arch, self.hf_tokenizer = hf_arch, hf_tokenizer
        store_attr(self, 'max_seq_len, truncation_strategy, task')

    def encodes(self, samples):
        """Runs every sample through `build_hf_input` and returns a list of `(HF_BaseInput, *targets)` tuples."""
        encoded_samples = []

        for sample in samples:
            text_ids, sample_targets = sample[0], sample[1:]

            # a tuple in the first slot means the example is made up of two text inputs
            if isinstance(text_ids, tuple):
                first_ids, second_ids = text_ids[0].tolist(), text_ids[1].tolist()
            else:
                first_ids, second_ids = text_ids.tolist(), None

            # padding to the max length is always requested here (the `True` argument)
            hf_base_input, targets = build_hf_input(self.task, self.hf_tokenizer,
                                                    first_ids, second_ids, sample_targets,
                                                    self.max_seq_len, True, self.truncation_strategy)

            encoded_samples.append((hf_base_input, *targets))

        return encoded_samples
    

In [None]:
#export
class HF_TextBlock(TransformBlock):
    """A `TransformBlock` for text: items are tokenized and numericalized, then batches are assembled by `hf_batch_tfm`."""
    
    @delegates(Numericalize.__init__)
    def __init__(self, tok_tfms, hf_arch, hf_tokenizer, hf_batch_tfm=None, vocab=None, task=None,
                 max_seq_len=512, **kwargs):
        """
        `tok_tfms` are applied ahead of `Numericalize(vocab, **kwargs)` as this block's type transforms, and
        `hf_batch_tfm` is registered as the `before_batch` transform of a `SortedDL`.

        `task` and `max_seq_len` are only used to build the default `HF_BatchTransform`; they are ignored
        when the caller supplies their own `hf_batch_tfm`.
        """
        if hf_batch_tfm is None:
            hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, max_seq_len=max_seq_len,
                                             truncation_strategy='longest_first', task=task)
            
        # `__init__` has nothing to return, so the parent initializer is called without `return`
        super().__init__(type_tfms=[*tok_tfms, Numericalize(vocab, **kwargs)],
                         dl_type=SortedDL, 
                         dls_kwargs={ 'before_batch': hf_batch_tfm })

    @classmethod
    @delegates(Tokenizer.from_df, keep=True)
    def from_df(cls, text_cols_lists, hf_arch, hf_tokenizer, hf_batch_tfm=None, vocab=None, task=None, 
                tok_func_mode='str', res_col_names=None, max_seq_len=512, **kwargs):
        """Creates a HF_TextBlock via a pandas DataFrame

        `text_cols_lists` holds one list of DataFrame columns per text input, and `res_col_names` the name of
        the tokenized column created for each of them (defaults to 'text0', 'text1', ...). `kwargs` are
        forwarded to `Tokenizer.from_df`.

        Raises a `ValueError` if fewer `res_col_names` than text inputs are supplied.
        """
        text_cols_lists = list(text_cols_lists)
        
        # grab hf tokenizer class to do the actual tokenization (via tok_func) and its vocab
        tokenizer_cls = partial(HF_Tokenizer, hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, mode=tok_func_mode)
        if (vocab is None): vocab = list(hf_tokenizer.get_vocab())

        # build the column name(s) returned after tokenization
        if (res_col_names is None): 
            res_col_names = [ f'text{i}' for i in range(len(text_cols_lists)) ] 
        else:
            res_col_names = list(res_col_names)
            # `zip` below stops at the shorter sequence, which would silently drop text inputs
            if len(res_col_names) < len(text_cols_lists):
                raise ValueError(f'`res_col_names` has {len(res_col_names)} name(s) but `text_cols_lists` '
                                 f'describes {len(text_cols_lists)} text input(s)')
    
        tok_tfms = [ Tokenizer.from_df(text_cols, 
                                       res_col_name=res_col_name, 
                                       tok_func=tokenizer_cls,
                                       rules=[], **kwargs) 
                    for text_cols, res_col_name in zip(text_cols_lists, res_col_names) ]
  
        return cls(tok_tfms, hf_arch=hf_arch, hf_tokenizer=hf_tokenizer, hf_batch_tfm=hf_batch_tfm, 
                   vocab=vocab, task=task, max_seq_len=max_seq_len)

In [None]:
show_doc(HF_TextBlock.from_df)

<h4 id="HF_TextBlock.from_df" class="doc_header"><code>HF_TextBlock.from_df</code><a href="__main__.py#L16" class="source_link" style="float:right">[source]</a></h4>

> <code>HF_TextBlock.from_df</code>(**`text_cols_lists`**, **`hf_arch`**, **`hf_tokenizer`**, **`hf_batch_tfm`**=*`None`*, **`vocab`**=*`None`*, **`task`**=*`None`*, **`tok_func_mode`**=*`'str'`*, **`res_col_names`**=*`None`*, **`max_seq_len`**=*`512`*, **`tok_func`**=*`'SpacyTokenizer'`*, **`rules`**=*`None`*, **`sep`**=*`' '`*, **`n_workers`**=*`16`*, **`mark_fields`**=*`None`*, **`res_col_name`**=*`'text'`*, **\*\*`kwargs`**)

Creates a HF_TextBlock via a pandas DataFrame

Currently, we've only implemented building this block from a pandas DataFrame.  It handles single and multiple text inputs so that it can be used out-of-the-box against any model in the huggingface arsenal (e.g. sequence classification, question-answer, summarization, token classification, etc...).

## Examples

### Sequence classification (e.g., models that require a single text input)

In [None]:
# fetch fastai's IMDB sample dataset and load its texts.csv into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)

# NOTE(review): `model_path` is not referenced again in the visible part of this notebook
model_path = Path('models')
imdb_df = pd.read_csv(path/'texts.csv')

In [None]:
imdb_df.head()

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


There are a bunch of ways we can get at the four huggingface elements we need (e.g., architecture name, tokenizer, config, and model).  We can just create them directly, or we can use one of the helper methods available via `BLURR_MODEL_HELPER`.

In [None]:
# the task is passed to `get_auto_hf_objects` below, together with the pretrained model name and its config
task = HF_TASKS_AUTO.ForSequenceClassification

pretrained_model_name = "roberta-base" # "distilbert-base-uncased" "bert-base-uncased"
config = AutoConfig.from_pretrained(pretrained_model_name)

# returns the four huggingface elements: architecture name, tokenizer, config, and model
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, 
                                                                                    task=task, 
                                                                                    config=config)

Once you have those elements, you can create your `DataBlock` as simply as shown below. Note that you can use multiple columns in your DataFrame to make up the *single* text element required by `HF_TextBlock` below.

In [None]:
# single input
blocks = (
    HF_TextBlock.from_df(text_cols_lists=[['text']], hf_arch=hf_arch, hf_tokenizer=hf_tokenizer),
    CategoryBlock
)

# `text0` is the default name `HF_TextBlock.from_df` gives to the first tokenized text input
dblock = DataBlock(blocks=blocks, 
                   get_x=lambda x: x.text0,
                   get_y=ColReader('label'), 
                   splitter=ColSplitter(col='is_valid'))

In [None]:
# dblock.summary(imdb_df)

In [None]:
dls = dblock.dataloaders(imdb_df, bs=4)

In [None]:
b = dls.one_batch(); len(b), len(b[0]), len(b[1]) 

(2, 3, 4)

In [None]:
# b[0] is the HF_BaseInput triple (input_ids, token_type_ids, attention_mask); b[1] holds the targets.
# NOTE(review): the [4, 1] shape shown for token_type_ids looks like the one-element placeholder that
# `build_hf_input` substitutes when the tokenizer returns no `token_type_ids` — confirm
b[0][0].shape, b[0][1].shape, b[0][2].shape, b[1].shape

(torch.Size([4, 512]),
 torch.Size([4, 1]),
 torch.Size([4, 512]),
 torch.Size([4]))

In [None]:
#export
@typedispatch
def show_batch(x:HF_BaseInput, y, samples, hf_tokenizer, skip_special_tokens=True, ctxs=None, max_n=6, **kwargs):        
    if ctxs is None: ctxs = get_empty_df(min(len(samples), max_n))
    
    samples = L((TitledStr(hf_tokenizer.decode(inp, skip_special_tokens=skip_special_tokens).replace(hf_tokenizer.pad_token, '')),*s[1:]) 
                for inp, s in zip(x[0], samples))
    
    ctxs = show_batch[object](x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)

    display_df(pd.DataFrame(ctxs))
    return ctxs

In [None]:
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,category
0,"Raising Victor Vargas: A Review<br /><br />You know, Raising Victor Vargas is like sticking your hands into a big, steaming bowl of oatmeal. It's warm and gooey, but you're not sure if it feels right. Try as I might, no matter how warm and gooey Raising Victor Vargas became I was always aware that something didn't quite feel right. Victor Vargas suffers from a certain overconfidence on the director's part. Apparently, the director thought that the ethnic backdrop of a Latino family on the lower east side, and an idyllic storyline would make the film critic proof. He was right, but it didn't fool me. Raising Victor Vargas is the story about a seventeen-year old boy called, you guessed it, Victor Vargas (Victor Rasuk) who lives his teenage years chasing more skirt than the Rolling Stones could do in all the years they've toured. The movie starts off in `Ugly Fat' Donna's bedroom where Victor is sure to seduce her, but a cry from outside disrupts his plans when his best-friend Harold (Kevin Rivera) comes-a-looking for him. Caught in the attempt by Harold and his sister, Victor Vargas runs off for damage control. Yet even with the embarrassing implication that he's been boffing the homeliest girl in the neighborhood, nothing dissuades young Victor from going off on the hunt for more fresh meat. On a hot, New York City day they make way to the local public swimming pool where Victor's eyes catch a glimpse of the lovely young nymph Judy (Judy Marte), who's not just pretty, but a strong and independent too. The relationship that develops between Victor and Judy becomes the focus of the film. The story also focuses on Victor's family that is comprised of his grandmother or abuelita (Altagracia Guzman), his brother Nino (also played by real life brother to Victor, Silvestre Rasuk) and his sister Vicky (Krystal Rodriguez). The action follows Victor between scenes with Judy and scenes with his family. 
Victor tries to cope with being an oversexed pimp-daddy, his feelings for Judy and his grandmother's conservative Catholic upbringing.<br /><br />The problems that arise from Raising Victor Vargas are a few, but glaring errors. Throughout the film you get to know certain characters like Vicky, Nino, Grandma,",negative
1,"THE SHOP AROUND THE CORNER is one of the sweetest and most feel-good romantic comedies ever made. There's just no getting around that, and it's hard to actually put one's feeling for this film into words. It's not one of those films that tries too hard, nor does it come up with the oddest possible scenarios to get the two protagonists together in the end. In fact, all its charm is innate, contained within the characters and the setting and the plot... which is highly believable to boot. It's easy to think that such a love story, as beautiful as any other ever told, *could* happen to you... a feeling you don't often get from other romantic comedies, however sweet and heart-warming they may be. <br /><br />Alfred Kralik (James Stewart) and Clara Novak (Margaret Sullavan) don't have the most auspicious of first meetings when she arrives in the shop (Matuschek & Co.) he's been working in for the past nine years, asking for a job. They clash from the very beginning, mostly over a cigarette box that plays music when it's opened--he thinks it's a ludicrous idea; she makes one big sell of it and gets hired. Their bickering takes them through the next six months, even as they both (unconsciously, of course!) fall in love with each other when they share their souls and minds in letters passed through PO Box 237. 
This would be a pretty thin plotline to base an entire film on, except that THE SHOP AROUND THE CORNER is expertly fleshed-out with a brilliant supporting cast made up of entirely engaging characters, from the fatherly but lonely Hugo Matuschek (Frank Morgan) himself, who learns that his shop really is his home; Pirovitch (Felix Bressart), Kralik's sidekick and friend who always skitters out of the room when faced with the possibility of being asked for his honest opinion; smarmy pimp-du-jour Vadas (Joseph Schildkraut) who ultimately gets his comeuppance from a gloriously righteous Kralik; and ambitious errand boy Pepi Katona (William Tracy) who wants nothing more than to be promoted to the position of clerk for Matuschek & Co. The unpretentious love story between 'Dear Friends' is played out in this little shop in",positive


### Question Answering (e.g., models that require two text inputs)

We've provided a simple subset of a pre-processed SQUADv2 dataset below just for demonstration purposes. There is a lot that can be done to make this much better and more fully functional.  The idea here is just to show you how things can work for tasks beyond sequence classification. 

In [None]:
# the sample csv is read relative to the current working directory
path = Path('./')
squad_df = pd.read_csv(path/'squad_sample.csv'); len(squad_df)

1000

In [None]:
squad_df.head(2)

Unnamed: 0,title,context,question_id,question_text,is_impossible,answer_text,answer_start,answer_end
0,New_York_City,"The New York City Fire Department (FDNY), provides fire protection, technical rescue, primary response to biological, chemical, and radioactive hazards, and emergency medical services for the five boroughs of New York City. The New York City Fire Department is the largest municipal fire department in the United States and the second largest in the world after the Tokyo Fire Department. The FDNY employs approximately 11,080 uniformed firefighters and over 3,300 uniformed EMTs and paramedics. The FDNY's motto is New York's Bravest.",56d1076317492d1400aab78c,What does FDNY stand for?,False,New York City Fire Department,4,33
1,Cyprus,"Following the death in 1473 of James II, the last Lusignan king, the Republic of Venice assumed control of the island, while the late king's Venetian widow, Queen Catherine Cornaro, reigned as figurehead. Venice formally annexed the Kingdom of Cyprus in 1489, following the abdication of Catherine. The Venetians fortified Nicosia by building the Venetian Walls, and used it as an important commercial hub. Throughout Venetian rule, the Ottoman Empire frequently raided Cyprus. In 1539 the Ottomans destroyed Limassol and so fearing the worst, the Venetians also fortified Famagusta and Kyrenia.",572e7f8003f98919007566df,In what year did the Ottomans destroy Limassol?,False,1539,481,485


In [None]:
max_seq_len= 512

In [None]:
# keep only the answerable questions whose `answer_end` is below `max_seq_len`
# NOTE(review): going by the sample rows, answer_start/answer_end look like character offsets into `context`
# rather than token positions — confirm this is the intended target for the demo
squad_df = squad_df[(squad_df.answer_end < max_seq_len) & (squad_df.is_impossible == False)]

In [None]:
# same recipe as for sequence classification, only the task differs
task = HF_TASKS_AUTO.ForQuestionAnswering

pretrained_model_name = "roberta-base"
config = AutoConfig.from_pretrained(pretrained_model_name)

# returns the four huggingface elements: architecture name, tokenizer, config, and model
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, 
                                                                                    task=task, 
                                                                                    config=config)

In [None]:
# identity mapping {0: 0, 1: 1, ..., max_seq_len-1: max_seq_len-1}; used as the vocab of both
# `CategoryBlock` targets (answer start / answer end) when the QA DataBlock is built
vocab = dict(enumerate(range(max_seq_len)));

Below we utilize the @typedispatch decorator to completely change how we'll tokenize the data for the `ForQuestionAnsweringTask`.

In [None]:
#export
@typedispatch
def build_hf_input(task:ForQuestionAnsweringTask, tokenizer, 
                   a_tok_ids, b_tok_ids=None, targets=None,
                   max_length=512, pad_to_max_length=True, truncation_strategy=None):
    """Question answering flavor of `build_hf_input`.

    The two sequences are ordered by the tokenizer's padding side: (`a_tok_ids`, `b_tok_ids`) when it pads on
    the right, (`b_tok_ids`, `a_tok_ids`) otherwise. When `truncation_strategy` is None it defaults to
    "only_second" / "only_first" respectively, so `b_tok_ids` is the sequence that gets truncated.

    Returns `(HF_BaseInput([input_ids, token_type_ids, attention_mask]), targets)`; `targets` is passed
    through untouched.
    """
    pads_on_right = (tokenizer.padding_side == "right")

    if truncation_strategy is None:
        truncation_strategy = "only_second" if pads_on_right else "only_first"

    first_ids, second_ids = (a_tok_ids, b_tok_ids) if pads_on_right else (b_tok_ids, a_tok_ids)

    encoded = tokenizer.prepare_for_model(first_ids, second_ids,
                                          max_length=max_length,
                                          pad_to_max_length=pad_to_max_length,
                                          truncation_strategy=truncation_strategy,
                                          return_tensors='pt')

    model_inputs = [encoded['input_ids'][0]]
    for key in ('token_type_ids', 'attention_mask'):
        # a one element [-9999] tensor stands in for anything the tokenizer did not return
        model_inputs.append(encoded[key][0] if key in encoded else torch.tensor([-9999]))

    return HF_BaseInput(model_inputs), targets
    

And here we demonstrate some more of the extensibility bits of the framework, by passing in our own instance of `HF_BatchTransform`.

In [None]:
# (optional): override HF_BatchTransform defaults
# `truncation_strategy=None` lets the `ForQuestionAnsweringTask` version of `build_hf_input` pick the
# strategy from the tokenizer's padding side
hf_batch_tfm = HF_BatchTransform(hf_arch, hf_tokenizer, task=ForQuestionAnsweringTask(),
                                 max_seq_len=128, truncation_strategy=None)

# two text inputs => two tokenized columns, named `text0` (question) and `text1` (context) by default
blocks = (
    HF_TextBlock.from_df(text_cols_lists=[['question_text'],['context']],
                         hf_arch=hf_arch, 
                         hf_tokenizer=hf_tokenizer, 
                         hf_batch_tfm=hf_batch_tfm),
    CategoryBlock(vocab=vocab),
    CategoryBlock(vocab=vocab)
)

# `n_inp=1`: the (question, context) tuple is the single input, answer start/end are the two targets
dblock = DataBlock(blocks=blocks, 
                   get_x=lambda x: (x.text0, x.text1),
                   get_y=[ColReader('answer_start'), ColReader('answer_end')],
                   splitter=RandomSplitter(),
                   n_inp=1)

In [None]:
# dblock.summary(squad_df)

In [None]:
dls = dblock.dataloaders(squad_df, bs=4)

In [None]:
b = dls.one_batch(); len(b), len(b[0]), len(b[1]), len(b[2])

(3, 3, 4, 4)

In [None]:
# b[0] is the HF_BaseInput triple (input_ids, token_type_ids, attention_mask); b[1] and b[2] are the
# answer start / answer end targets. Sequences are 128 long because of the `max_seq_len=128` override.
b[0][0].shape, b[0][1].shape, b[0][2].shape, b[1].shape, b[2].shape

(torch.Size([4, 128]),
 torch.Size([4, 1]),
 torch.Size([4, 128]),
 torch.Size([4]),
 torch.Size([4]))

In [None]:
dls.show_batch(hf_tokenizer=hf_tokenizer, skip_special_tokens=False, max_n=2)

Unnamed: 0,text,category,category_
0,"<s>In what century was the process of using hops to produce beer introduced to England?</s></s>Traditional English ale was made solely from fermented malt. The practice of adding hops to produce beer was introduced from the Netherlands in the early 15th century. Alehouses would each brew their own distinctive ale, but independent breweries began to appear in the late 17th century. By the end of the century almost all beer was brewed by commercial breweries.</s>",154,158
1,"<s>What did Khrushchev want Nasser to remove the ban on?</s></s>A day after announcing the attempt on his life, Nasser established a new provisional constitution proclaiming a 600-member National Assembly (400 from Egypt and 200 from Syria) and the dissolution of all political parties. Nasser gave each of the provinces two vice-presidents: Boghdadi and Amer in Egypt, and Sabri al-Asali and Akram al-Hawrani in Syria. Nasser then left for Moscow to meet with Nikita Khrushchev. At the meeting, Khrushchev pressed Nasser to lift the ban</s>",481,496


### Token classification (e.g., NER tasks)

In [None]:
# the sample file is used here; the commented-out line below points to a full cleaned dataset
# germ_eval_df = pd.read_csv('./data/task-token-classification/germeval2014ner/germeval2014ner_cleaned.csv')
germ_eval_df = pd.read_csv('./germeval2014_sample.csv')
germ_eval_df.head()

Unnamed: 0,pos,token,tag1,tag2,ds_type,seq_id,n_tokens
0,1,Schartau,B-PER,O,train,1,3
1,2,sagte,O,O,train,1,1
2,3,dem,O,O,train,1,1
3,4,"""",O,O,train,1,1
4,5,Tagesspiegel,B-ORG,O,train,1,3


In [None]:
# reassign instead of using `inplace=True`, then display any rows still missing a token (none expected)
germ_eval_df = germ_eval_df.dropna()
germ_eval_df[germ_eval_df.token.isna()]

Unnamed: 0,pos,token,tag1,tag2,ds_type,seq_id,n_tokens


In [None]:
# the sorted unique `tag1` values; used later as the target vocab and to size `config.num_labels`
labels = sorted(germ_eval_df.tag1.unique())
print(labels)

['B-LOC', 'B-LOCderiv', 'B-LOCpart', 'B-ORG', 'B-ORGpart', 'B-OTH', 'B-PER', 'B-PERpart', 'I-LOC', 'I-LOCderiv', 'I-ORG', 'I-ORGpart', 'I-OTH', 'I-PER', 'O']


In [None]:
# same recipe as the earlier examples, only the task and pretrained model differ
task = HF_TASKS_AUTO.ForTokenClassification

pretrained_model_name = "bert-base-multilingual-cased"
config = AutoConfig.from_pretrained(pretrained_model_name)
# one output label per entry in `labels`
config.num_labels = len(labels)

# returns the four huggingface elements: architecture name, tokenizer, config, and model
hf_arch, hf_tokenizer, hf_config, hf_model = BLURR_MODEL_HELPER.get_auto_hf_objects(pretrained_model_name, 
                                                                                    task=task, 
                                                                                    config=config)
hf_arch, type(hf_tokenizer), type(hf_config), type(hf_model)

('bert',
 transformers.tokenization_bert.BertTokenizer,
 transformers.configuration_bert.BertConfig,
 transformers.modeling_bert.BertForTokenClassification)

In [None]:
# collapse the one-row-per-token frame into one row per `seq_id`, with every other column turned into a list
# NOTE: this overwrites `germ_eval_df`, so re-running the cell aggregates the already aggregated frame
germ_eval_df = germ_eval_df.groupby(by='seq_id').agg(list).reset_index()
germ_eval_df.head()

Unnamed: 0,seq_id,pos,token,tag1,tag2,ds_type,n_tokens
0,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]","[Schartau, sagte, dem, "", Tagesspiegel, "", vom, Freitag, ,, Fischer, sei, "", in, einer, Weise, aufgetreten, ,, die, alles, andere, als, überzeugend, war, "", .]","[B-PER, O, O, O, B-ORG, O, O, O, O, B-PER, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train]","[3, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1]"
1,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]","[Firmengründer, Wolf, Peter, Bree, arbeitete, Anfang, der, siebziger, Jahre, als, Möbelvertreter, ,, als, er, einen, fliegenden, Händler, aus, dem, Libanon, traf, .]","[O, B-PER, I-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train]","[3, 1, 1, 2, 1, 1, 1, 3, 1, 1, 4, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1]"
2,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]","[Ob, sie, dabei, nach, dem, Runden, Tisch, am, 23., April, in, Berlin, durch, ein, pädagogisches, Konzept, unterstützt, wird, ,, ist, allerdings, zu, bezweifeln, .]","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train, train]","[1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 3, 1]"
3,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[Bayern, München, ist, wieder, alleiniger, Top-, Favorit, auf, den, Gewinn, der, deutschen, Fußball-Meisterschaft, .]","[B-ORG, I-ORG, O, O, O, O, O, O, O, O, O, B-LOCderiv, O, O]","[B-LOC, B-LOC, O, O, O, O, O, O, O, O, O, O, O, O]","[train, train, train, train, train, train, train, train, train, train, train, train, train, train]","[1, 1, 1, 1, 2, 2, 3, 1, 1, 1, 1, 1, 3, 1]"
4,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[Dabei, hätte, der, tapfere, Schlussmann, allen, Grund, gehabt, ,, sich, viel, früher, aufzuregen, .]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[train, train, train, train, train, train, train, train, train, train, train, train, train, train]","[1, 1, 1, 2, 2, 1, 1, 3, 1, 1, 1, 1, 3, 1]"


In [None]:
#export
class HF_TokenTensorCategory(TensorBase):
    """`TensorBase` subclass that types the per-token category ids produced by `HF_TokenCategorize`."""

In [None]:
#export
class HF_TokenCategorize(Transform):
    "Reversible transform of a list of category string to `vocab` id"
    
    def __init__(self, vocab=None, ignore_token=None, ignore_token_id=None):  
        # `ignore_token_id` defaults to the `ignore_index` of `CrossEntropyLossFlat`, so the masked
        # positions are the ones skipped by `loss_func`
        self.vocab = None if vocab is None else CategoryMap(vocab)
        self.ignore_token = '[xIGNx]' if ignore_token is None else ignore_token
        self.ignore_token_id = CrossEntropyLossFlat().ignore_index if ignore_token_id is None else ignore_token_id
        
        self.loss_func, self.order = CrossEntropyLossFlat(ignore_index=self.ignore_token_id), 1

    def setups(self, dsets):
        # NOTE(review): with `vocab=None` the map is built straight from `dsets`; presumably its items are
        # lists of (label, n_subtoks) tuples rather than plain labels — verify before relying on this path
        if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets)
        self.c = len(self.vocab)

    def encodes(self, labels):
        """
        `labels` is an iterable of (label, n_subtoks) pairs. Each pair contributes the label's vocab id for
        the first sub-token followed by `ignore_token_id` for the remaining ones.
        """
        ids = []
        for lbl, n_subtoks in labels:
            # an entity with no sub-tokens contributes nothing to the input, so it must not contribute
            # a label either (otherwise every following label is shifted by one position)
            if n_subtoks < 1: continue
            ids.append(self.vocab.o2i[lbl])
            ids.extend([self.ignore_token_id] * (n_subtoks - 1))

        # built as a flat list directly, which also copes with an empty `labels`
        return HF_TokenTensorCategory(ids)
    
    def decodes(self, encoded_labels): 
        # the ignored positions are dropped, leaving one category per labelled token
        return Category([(self.vocab[lbl_id]) for lbl_id in encoded_labels if lbl_id != self.ignore_token_id ])

`HF_TokenCategorize` modifies the fastai `Categorize` transform in a couple of ways.  First, it allows your targets to consist of a `Category` ***per*** token, and second, it uses the idea of an `ignore_token` to mask subtokens that don't need a prediction.  For example, the targets of special tokens (e.g., pad, cls, sep) are set to `ignore_token`, as are those of every sub-token after the first when a token is made up of more than one sub-token.

In [None]:
#export
def HF_TokenCategoryBlock(vocab=None, ignore_token=None, ignore_token_id=None):
    "`TransformBlock` for per-token categorical targets, built around `HF_TokenCategorize`"
    token_categorize = HF_TokenCategorize(vocab=vocab,
                                          ignore_token=ignore_token,
                                          ignore_token_id=ignore_token_id)

    return TransformBlock(type_tfms=token_categorize)

In [None]:
show_doc(HF_TokenCategoryBlock)

<h4 id="HF_TokenCategoryBlock" class="doc_header"><code>HF_TokenCategoryBlock</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>HF_TokenCategoryBlock</code>(**`vocab`**=*`None`*, **`ignore_token`**=*`None`*, **`ignore_token_id`**=*`None`*)

`TransformBlock` for single-label categorical targets

In [None]:
#export
@typedispatch
def build_hf_input(task:ForTokenClassificationTask, tokenizer, a_tok_ids, b_tok_ids=None, targets=None,
                   max_length=512, pad_to_max_length=True, truncation_strategy='longest_first'):

    res = tokenizer.prepare_for_model(a_tok_ids, b_tok_ids, 
                                      max_length=max_length, 
                                      pad_to_max_length=pad_to_max_length,
                                      truncation_strategy=truncation_strategy, 
                                      return_special_tokens_mask=True,
                                      return_tensors='pt')

    input_ids = res['input_ids'][0]
    token_type_ids = res['token_type_ids'][0] if ('token_type_ids' in res) else torch.tensor([-9999]) 
    attention_mask = res['attention_mask'][0] if ('attention_mask' in res) else torch.tensor([-9999]) 
    
    # we assume that first target = the categories we want to predict for each token
    if (len(targets) > 0):
        target_cls = type(targets[0])
        idx_first_input_id = res['special_tokens_mask'].index(0)
        targ_ids = target_cls([ el*-100 if (el == 1) else targets[0][idx-idx_first_input_id].item() 
                    for idx, el in enumerate(res['special_tokens_mask']) ])

        # just in case there are other targets, we modify the first with the padded targ_ids
        updated_targets = list(targets)
        updated_targets[0] = targ_ids
    else:
        updated_targets= list(targets)
    
    return HF_BaseInput([input_ids, token_type_ids, attention_mask]), tuple(updated_targets)

We need a custom `build_hf_input` because we need to align the target tokens with the input tokens (e.g., if there are 512 input tokens, there need to be 512 targets).

In [None]:
# single input
# `tok_func_mode='list'` because each `token` entry is a list of entities rather than a raw string
blocks = (
    HF_TextBlock.from_df(text_cols_lists=[['token']], 
                         hf_arch=hf_arch, 
                         hf_tokenizer=hf_tokenizer, 
                         tok_func_mode='list', 
                         task=ForTokenClassificationTask()),
    HF_TokenCategoryBlock(vocab=labels)
)

def get_y(inp):
    """Pairs each tag in `inp.tag1` with the number of sub-tokens `hf_tokenizer` produces for the matching entry of `inp.token`."""
    return [ (label, len(hf_tokenizer.tokenize(str(entity)))) for entity, label in zip(inp.token, inp.tag1) ]

# `text0` is the default name `HF_TextBlock.from_df` gives to the first tokenized text input
dblock = DataBlock(blocks=blocks, 
                   get_x=lambda x: x.text0,
                   get_y=get_y,
                   splitter=RandomSplitter())

Note that in the example above we had to define a `get_y` in order to return both the label of each entity we want to predict a category for and the number of subtokens the `hf_tokenizer` uses to represent that entity.  This is necessary for the input/target alignment discussed above.

In [None]:
# dblock.summary(germ_eval_df)

In [None]:
dls = dblock.dataloaders(germ_eval_df, bs=4)

In [None]:
b = dls.one_batch()

In [None]:
len(b), b[0][0].shape, b[1].shape

(2, torch.Size([4, 512]), torch.Size([4, 512]))

In [None]:
dls.show_batch(hf_tokenizer=hf_tokenizer, max_n=2)

Unnamed: 0,text,category
0,"Das SS - Freiwilligen - Grenadier - Regiment 88 wurde im März aus einer Kampfgruppe der SS - Führerschule des Wirtschafts - und Verwaltungsdienstes und Teilen des I. / SS - Polizei - Regiments 34 der Ordnungspolizei, Heeresangehörigen und Volkssturm gebildet.","['O', 'B-ORGpart', 'I-ORGpart', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGpart', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGpart', 'I-ORGpart', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"
1,"Scenes of a Sexual Nature ( GB 2006 ) - Regie : Ed Blum Shortbus ( USA 2006 ) - Regie : John Cameron Mitchell : Film über den gleichnamigen New Yorker Club, der verschiedensten Paaren eine Plattform zur Aufarbeitung ihrer Probleme bietet.","['B-OTH', 'I-OTH', 'I-OTH', 'I-OTH', 'I-OTH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'B-OTH', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-LOCderiv', 'I-LOCderiv', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']"


## Cleanup

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_data.ipynb.
Converted 02_modeling.ipynb.
Converted index.ipynb.
