# Modelo BERT

In [None]:
import numpy as np
import pandas as pd

from fastai.text.all import *
from fastai.basics import *
from fastai.callback.all import *

from transformers import AutoModelForMaskedLM, AutoModel, AutoModelForPreTraining, AutoTokenizer, AutoConfig

import os

In [None]:
# Display the current working directory (relative paths below are resolved against it).
os.getcwd()

## Datos

In [None]:
# Output locations; neither `path` nor `model_path` is referenced again in the visible cells.
path = Path('/kaggle/working')
model_path = Path('models')
# Load the corpus; later cells read its `text` column.
df = pd.read_csv('../input/es-wiki/books_dataset.csv')
df.head()

## Clases automáticas
Permiten crear diferentes modelos y tokenizers cambiando el nombre del modelo. En este caso usaremos BERT en español.

- `AutoModelForMaskedLM`: modelo de lenguaje.
- `AutoTokenizer`: el tokenizador.
- `AutoConfig`: define la arquitectura y la configuración (modificable).
- `model_name`: nombre del modelo. Consultar https://huggingface.co/models

In [None]:
# Checkpoint identifier shared by the tokenizer, the config and the model.
model_name = 'dccuchile/bert-base-spanish-wwm-uncased' 
# Model class instantiated later inside `LMModel`.
lm_model_class = AutoModelForMaskedLM 
# Configuration of the checkpoint; `LMModel` builds the model from it when `pretrained=False`.
config_dict = AutoConfig.from_pretrained(model_name)

## Tokenizer y vocab
El tokenizador creado con `AutoTokenizer` devuelve el vocabulario como un diccionario (token → id) mediante `get_vocab`; hay que convertirlo en una lista ordenada por id para Fastai.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer_vocab = tokenizer.get_vocab()
# Order the tokens by their id so that a token's list position equals its id.
tokenizer_vocab_ls = sorted(tokenizer_vocab, key=tokenizer_vocab.get)
print(f'Tokenizer: {tokenizer.__class__}')
print(f'Vocab length: {len(tokenizer_vocab_ls)}')

In [None]:
# Display the special tokens defined by the loaded tokenizer.
tokenizer.special_tokens_map

Encapsulamos el tokenizador de Hugging Face en una clase propia para adaptarlo a la interfaz que espera Fastai.

In [None]:
class FastHugsTokenizer():
    """Adapt a Hugging Face tokenizer to the callable-tokenizer interface used by fastai.

    Args:
        transformer_tokenizer: the tokenizer that has already been loaded from the tokenizer class.
        model_name: model type set by the user; when it contains ``'roberta'`` the text is
            tokenized with ``add_prefix_space=True``.
        max_seq_len: override for the default sequence length, typically 512 for bert-like models.
            `transformer_tokenizer.max_len_single_sentence` and
            `transformer_tokenizer.max_len_sentences_pair` both account for the need to add
            additional special tokens, i.e. for RoBERTa-base max_len_single_sentence==510,
            leaving space for the 2 additional special tokens to be added for the model's
            default 512 positional embeddings.
        pretrained: when true, `max_seq_len` is validated against (or taken from) the
            tokenizer's limits; when false it is stored as given.
        pair: whether a single sentence (sequence) or a pair of sentences is used.
        **kwargs: accepted for call-site compatibility and ignored.

    Calling the instance with an iterable of strings yields one token list per string,
    truncated to the maximum sequence length set by the user or the tokenizer default.
    """
    def __init__(self, transformer_tokenizer=None, model_name='roberta', max_seq_len=None, 
                 pretrained=True, pair=False, **kwargs): 
        self.model_name, self.tok, self.max_seq_len=model_name, transformer_tokenizer, max_seq_len
        if pretrained:
            if self.max_seq_len:
                if pair: assert self.max_seq_len<=self.tok.max_len_sentences_pair, 'WARNING: max_seq_len needs to be less than or equal to transformer_tokenizer.max_len_sentences_pair'
                else: assert self.max_seq_len<=self.tok.max_len_single_sentence, 'WARNING: max_seq_len needs to be less than or equal to transformer_tokenizer.max_len_single_sentence'
            else:
                # No usable override given: fall back to the tokenizer's own limit.
                if pair: self.max_seq_len=ifnone(max_seq_len, self.tok.max_len_sentences_pair) 
                else: self.max_seq_len=ifnone(max_seq_len, self.tok.max_len_single_sentence)

    def do_tokenize(self, o:str):
        """Return the tokens of `o`, adding a prefix space for RoBERTa and truncating.

        The list is cut to ``max_seq_len - 2`` tokens (never below zero). When no
        `max_seq_len` is known (``pretrained=False`` and no override) nothing is cut.
        """
        # Use the name stored on the instance; a module-level `model_name` may not exist
        # or may refer to a different model.
        if 'roberta' in self.model_name: tokens = self.tok.tokenize(o, add_prefix_space=True)
        else: tokens = self.tok.tokenize(o)
        if self.max_seq_len is None: return tokens
        return tokens[:max(self.max_seq_len-2, 0)]

    def de_tokenize(self, o):
        """Return the string rebuilt from the tokens `o`."""
        return self.tok.convert_tokens_to_string(o)

    def __call__(self, items): 
        """Lazily tokenize every string in `items`."""
        for o in items: yield self.do_tokenize(o)

In [None]:
max_seq_len = None
sentence_pair = False

# `FastHugsTokenizer` names this option `pair`. A `sentence_pair=` keyword would be swallowed
# by its `**kwargs` and silently ignored, so the flag is passed under the real parameter name.
fasthugstok = FastHugsTokenizer(transformer_tokenizer=tokenizer, model_name=model_name, max_seq_len=max_seq_len, pair=sentence_pair)

Generamos una clase que hereda de `Tokenizer` de Fastai para decodificar los textos.

In [None]:
class MLMTokenizer(Tokenizer):
    "fastai `Tokenizer` that rebuilds text through the wrapped tokenizer's `de_tokenize`"
    def __init__(self, tokenizer, rules=None, counter=None, lengths=None, mode=None, sep=' ', **kwargs):
        # Extra keyword arguments are accepted but not forwarded to the base class.
        super().__init__(tokenizer, rules, counter, lengths, mode, sep)

    def _detokenize1(self, o):
        # Delegate to the wrapped tokenizer stored by the base class as `self.tok`.
        return self.tok.de_tokenize(o)

    def decodes(self, o):
        text = self._detokenize1(o)
        return TitledStr(str(text))

Tokenizamos el texto usando las clases anteriores. Añadimos la regla `fix_html` para limpiar restos de HTML (entidades como `&amp;` o saltos de línea `<br />`) que hayan podido quedar en el texto.

In [None]:
# Build the fastai tokenizer around `fasthugstok`, reading the `text` column and applying
# only the `fix_html` rule.
fastai_tokenizer = MLMTokenizer.from_df(text_cols='text', tok=fasthugstok, 
                                     rules=[fix_html])
# Display the rules actually registered on the tokenizer.
fastai_tokenizer.rules

Añadimos los tokens especiales.

In [None]:
class AddSpecialTokens(Transform):
    "Wrap the numericalized token ids with the tokenizer's special token ids"
    def __init__(self, tokenizer):
        self.tok = tokenizer

    def encodes(self, o):
        # The tokenizer receives a plain list; the result is wrapped back into a `TensorText`.
        ids_with_specials = self.tok.build_inputs_with_special_tokens(list(o))
        return TensorText(ids_with_specials)

Funciones para crear el dataset para Fastai.

In [None]:
class MLMTokensLabels(Transform):
    '''
        Build masked-language-modelling inputs and labels from a tensor of token ids.

        - Select a subset of the input token ids, each with probability `mlm_probability`
          (special tokens and padding are never selected).
        - Replace a `mask_token_prob` share of the selected tokens with the mask token.
        - Replace half of the remaining selected tokens with random token ids; the rest
          of the selected tokens are left unchanged.
        - Labels keep the original id at the selected positions and are `-100` elsewhere.
        - Most of this code comes from the `mask_tokens` function here https://github.com/huggingface/transformers/blob/a21d4fa410dc3b4c62f93aa0e6bbe4b75a101ee9/examples/run_language_modeling.py#L66
        Returns: tuple `(inputs, labels)`; note that `inputs` is modified in place.
    '''
    def __init__(self, tokenizer=None, mlm_probability=0.15, mask_token_prob=0.8):
        self.tok, self.mlm_probability, self.mask_token_prob=tokenizer, mlm_probability, mask_token_prob
    
    def _gen_probability_matrix(self, labels):
        "Per-position selection probability: `mlm_probability`, or 0 for special and pad tokens"
        # We sample a few tokens in each sequence for masked-LM training (with probability mlm_probability, defaults to 0.15 in Bert/RoBERTa)
        probability_matrix = torch.full(labels.shape, self.mlm_probability) 
        # Special tokens must never be selected.
        special_tokens_mask = self.tok.get_special_tokens_mask(labels.tolist(), already_has_special_tokens=True)
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
        # Padding positions are excluded as well, when the tokenizer defines a pad token.
        if self.tok._pad_token is not None:
            padding_mask = labels.eq(self.tok.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)
        return probability_matrix
    
    def _replace_with_mask(self, inputs, labels, masked_indices):
        "Overwrite a `mask_token_prob` share of the selected positions with the mask token id"
        # `mask_token_prob` of the time, we replace selected input tokens with tokenizer.mask_token ([MASK]);
        # `labels` is only used for its shape here.
        indices_replaced = torch.bernoulli(torch.full(labels.shape, self.mask_token_prob)).bool() & masked_indices
        inputs[indices_replaced] = self.tok.convert_tokens_to_ids(self.tok.mask_token)
        return inputs, indices_replaced
    
    def _replace_with_other(self, inputs, labels, masked_indices, indices_replaced):
        "Overwrite half of the selected-but-not-masked positions with random token ids"
        # (1-`mask_token_prob`)/2 of the time (10% with the defaults), we replace selected input tokens with a random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        # Random ids are drawn for every position, but only the chosen positions are written back.
        random_words = torch.randint(len(self.tok), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]
        return inputs
    
    def encodes(self, inputs):
        "Return `(inputs, labels)` for MLM; raises `ValueError` if the tokenizer has no mask token"
        if self.tok.mask_token is None:
            raise ValueError("This tokenizer does not have a mask token which is necessary for masked language modeling.")
        # Labels start as a copy of the untouched ids; `inputs` itself is edited below.
        labels = inputs.clone()
        
        # Get probability of whether a token will be masked
        probability_matrix = self._gen_probability_matrix(labels)
        
        # Create random mask indices according to probability matrix
        masked_indices = torch.bernoulli(probability_matrix).bool()
        
        # Mask the labels for indices that are NOT masked, we only compute loss on masked tokens
        labels[~masked_indices] = -100  
        
        # Randomly replace with mask token
        inputs, indices_replaced = self._replace_with_mask(inputs, labels, masked_indices)
        
        # Randomly replace with a random token
        inputs = self._replace_with_other(inputs, labels, masked_indices, indices_replaced)
        # The rest of the time (10% of the time with the defaults) we keep the selected input tokens unchanged
        return (inputs,labels)

In [None]:
@Numericalize
def decodes(self,o):
    'Decode ids to tokens, rendering the loss-mask value `-100` as `<loss_mask>`'
    ids = o[0] if isinstance(o, tuple) else o
    # Work on a copy so the shared vocab is left untouched; the sentinel is appended last,
    # which lets index -1 resolve to it.
    vocab_with_mask = self.vocab.copy()
    vocab_with_mask.append('<loss_mask>')
    tokens = (vocab_with_mask[-1 if i == -100 else i] for i in ids)
    return L(tok for tok in tokens if tok != PAD)

In [None]:
@delegates(Datasets)
class Datasets(Datasets):
    "`Datasets` variant that returns an item unwrapped, since the pipeline already yields a tuple"
    def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
        # Plain pass-through to the parent constructor.
        super().__init__(items=items, tfms=tfms, tls=tls, n_inp=n_inp, dl_type=dl_type, **kwargs)

    def __getitem__(self, it):
        # Mirrors the parent `__getitem__`, minus the extra tuple around a single item.
        per_pipeline = [tl[it] for tl in self.tls]
        if is_indexer(it):
            return per_pipeline[0]
        return list(zip(*per_pipeline))

Creamos el dataset.

In [None]:
# Despite the name, `splitter` holds the result of applying `ColSplitter()` to `df`
# (presumably train/valid index lists taken from a boolean column of the CSV — TODO confirm
# which column the file provides).
splitter = ColSplitter()(df)
# One pipeline: read `text`, tokenize, map tokens to ids, add special tokens, then build
# the `(inputs, labels)` pair for masked language modelling.
tfms=[attrgetter("text"), fastai_tokenizer, Numericalize(vocab=tokenizer_vocab_ls), 
      AddSpecialTokens(tokenizer), MLMTokensLabels(tokenizer)]

dsets = Datasets(df, splits=splitter, tfms=[tfms], dl_type=SortedDL)

# Peek at the first 20 input ids and labels of the first item.
dsets[0][0][:20], dsets[0][1][:20]

Para el padding, comprobamos la configuración `tokenizer.padding_side` y lo ajustamos dependiendo de si se rellena por la derecha o por la izquierda.

In [None]:
def pad_mlm_input(samples, pad_idx=1, pad_fields=(0,1), pad_first=False, max_seq_len=None, backwards=False,
                  label_fields=(1,), label_pad_idx=-100):
    """Pad every sample of a batch to a common length (adapted from fastai's `pad_input`).

    Args:
        samples: the batch items; each one is a sequence of fields, here `(inputs, labels)`.
        pad_idx: fill value for the padded input fields.
        pad_fields: indices of the fields to pad.
        pad_first: pad on the left instead of on the right.
        max_seq_len: fixed target length. When `None`, each field is padded to the longest
            sample of the batch for that field.
        backwards: flip each padded tensor, which also flips the padding side.
        label_fields: indices of the fields that hold MLM labels.
        label_pad_idx: fill value for the label fields. `-100` is the value `MLMTokensLabels`
            assigns to positions excluded from the loss (and the default `ignore_index` of
            PyTorch's cross-entropy), so padded positions do not contribute to the loss.

    Returns:
        A list with one tuple of fields per sample.

    A sample longer than the target length is not truncated; building the negative-sized
    padding tensor fails in torch.
    """
    pad_fields = L(pad_fields)
    label_fields = L(label_fields)
    if backwards: pad_first = not pad_first

    def _unwrap(x):
        # A field may arrive wrapped in a tuple; only its first element is used.
        return x[0] if isinstance(x, tuple) else x

    # Target length per padded field: the fixed `max_seq_len`, or else the batch maximum.
    target_len = {f: max_seq_len if max_seq_len is not None
                  else max((len(_unwrap(s[f])) for s in samples), default=0)
                  for f in pad_fields}

    def _f(field_idx, x):
        x = _unwrap(x)
        if field_idx not in pad_fields: return x
        # Labels get the loss-ignore value; padding them with the input pad id would make
        # every padded position a training target.
        fill = label_pad_idx if field_idx in label_fields else pad_idx
        pad = x.new_zeros(target_len[field_idx]-x.shape[0])+fill
        x1 = torch.cat([pad, x] if pad_first else [x, pad])
        if backwards: x1 = x1.flip(0)
        return retain_type(x1, x)
    return [tuple(_f(i, x) for i, x in enumerate(s)) for s in samples]

def transformer_mlm_padding(tokenizer=None, max_seq_len=None, sentence_pair=False): 
    'Return `pad_mlm_input` preconfigured from `tokenizer`; `pad_fields=[0,1]` pads both input and label'
    # Anything other than right-side padding means the padding goes first.
    left_pad = tokenizer.padding_side != 'right'
    # Fall back to the tokenizer's maximum length when no explicit length is given.
    target_len = ifnone(max_seq_len, tokenizer.model_max_length)
    # `sentence_pair` is accepted but not used here.
    return partial(pad_mlm_input, pad_fields=[0,1], pad_first=left_pad,
                   pad_idx=tokenizer.pad_token_id, max_seq_len=target_len)

Creamos los dataloaders con la configuración adecuada.

In [None]:
# Every sample is padded to a fixed length of 512 before batching.
padding = transformer_mlm_padding(tokenizer, max_seq_len=512)

# Batch size.
bs = 4
dls = dsets.dataloaders(bs=bs, before_batch=[padding])

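Comprobamos que la mayoría de las posiciones del target (`text_`) están marcadas con `-100` (se muestran como `<loss_mask>`) y, por tanto, no cuentan para la pérdida. Solo queremos calcular la función de pérdida sobre el ~15% de los tokens de la entrada (`text`) que se han seleccionado para ocultar.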

In [None]:
# Draw one batch and display the sizes of its first two elements (inputs and labels).
b = dls.one_batch()
b[0].size(), b[1].size()

In [None]:
# Display a decoded batch; label positions equal to -100 are rendered as `<loss_mask>`
# by the patched `Numericalize.decodes`.
dls.show_batch()

## Modelo

In [None]:
class LMModel(nn.Module):
    """Thin `nn.Module` wrapper around a language-model class.

    With `pretrained=True` the wrapped model is created with
    `lm_model_class.from_pretrained(model_name)`, otherwise with
    `lm_model_class.from_config(config_dict)`. Its token embeddings are then resized to
    `len(tokenizer)`. `forward` takes only `input_ids`, derives the attention mask from
    the tokenizer's pad id and returns the first element of the wrapped model's output.
    """
    def __init__(self, lm_model_class=None, tokenizer=None, model_name=None, config_dict=None, pretrained=False):
        super().__init__()
        self.tok = tokenizer
        if pretrained:
            backbone = lm_model_class.from_pretrained(model_name)
        else:
            backbone = lm_model_class.from_config(config_dict)
        # If the created object exposes a `.module` attribute, keep that inner module instead.
        self.model = getattr(backbone, "module", backbone)
        self.model.resize_token_embeddings(len(tokenizer))

    def forward(self, input_ids):
        # 1 where there is a real token, 0 on padding, in the same tensor type as the ids.
        not_padding = input_ids != self.tok.pad_token_id
        attention_mask = not_padding.type(input_ids.type())
        outputs = self.model(input_ids, attention_mask=attention_mask)
        return outputs[0]

El modelo preentrenado se carga con `pretrained=True`.

In [None]:
# Instantiate the wrapper with the pretrained weights (`pretrained=True`); `config_dict`
# is only consulted by `LMModel` when `pretrained=False`.
model = LMModel(lm_model_class=lm_model_class, tokenizer=tokenizer, model_name=model_name, 
                  config_dict=config_dict, pretrained=True)

## Entrenamiento

In [None]:
def mlm_accuracy(inp, targ, ignore_index=-100, axis=-1):
    """Accuracy over the positions whose label is not `ignore_index`.

    `MLMTokensLabels` sets the label of every non-selected token to -100. A plain accuracy
    would compare predictions against those -100 entries too and count them all as errors,
    so only the positions that take part in the loss are scored here.
    Returns 0 when the batch has no scored position.
    """
    pred = inp.argmax(dim=axis)
    scored = targ != ignore_index
    if not scored.any(): return pred.new_zeros(()).float()
    return (pred[scored] == targ[scored]).float().mean()

opt_func = partial(Adam, decouple_wd=True)

loss = CrossEntropyLossFlat()

learn = Learner(dls, model, opt_func=opt_func,
                loss_func=loss, metrics=[mlm_accuracy, Perplexity()]).to_fp16()

In [None]:
# Run the learning-rate finder; its result is reused below through the `.valley` attribute.
valley = learn.lr_find()

In [None]:
# Plot the curve recorded by the learning-rate finder.
learn.recorder.plot_lr_find()

In [None]:
# Train for 10 epochs with the one-cycle policy, using the learning rate suggested above.
learn.fit_one_cycle(10, valley.valley)

In [None]:
# Save the trained learner's model under this name.
learn.save('beto_lm_pretrained_10ep')