In [1]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [8]:
# Load the pretrained tokenizer published as 'neuralmind/bert-base-portuguese-cased'.
# Every later cell reads it through the module-level name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [9]:
# Register '[unused1]' as an additional special token. The generation helpers further
# down use its id (`_EOS`) as the end-of-title marker.
# NOTE(review): presumably this is what makes the tokenizer keep '[unused1]' as a single
# token when encoding/decoding -- confirm against the tokenizer documentation.
tokenizer.add_special_tokens({
    'additional_special_tokens': ['[unused1]']
})

0

In [14]:
# Load the fine-tuned checkpoint as a sequence classifier with one label per vocabulary
# entry, so the argmax over its logits can be read directly as a token id by the
# generation helpers below.
bert = TFAutoModelForSequenceClassification.from_pretrained('../models/ft_bert_cls/', 
                                                            num_labels=tokenizer.vocab_size)

Some layers from the model checkpoint at ../models/ft_bert_cls/ were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ../models/ft_bert_cls/ and are newly initialized: ['dropout_189']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Single-step smoke test: one forward pass for one (review, title-so-far) pair.
# NOTE(review): this cell had never been executed, was indented inconsistently and did
# not define its inputs. The two sample values below are borrowed from other cells of
# this notebook -- adjust as needed.
review_text = 'Excelente qualidade amei'
whole_title = '[MASK]'

tokenized_input = tokenizer([review_text], [whole_title], return_tensors='tf')
print(tokenizer.decode(tokenized_input['input_ids'][0]))
logits = bert(tokenized_input)[0][0]

In [25]:
# Cache the special-token ids used by the generation helpers. The unpacking relies on
# `encode` returning exactly four ids for this string and raises ValueError otherwise.
# NOTE(review): the names assume the order [CLS], [MASK], [unused1] (end-of-title
# marker), [SEP], i.e. that '[unused1]' is encoded as a single id -- confirm.
_CLS, _MASK, _EOS, _SEP  = tokenizer.encode('[MASK] [unused1]')

In [37]:
def bert_write(review_text, max_tokens=11, verbose=True):
    """Greedily generate a title for a single review, one token per forward pass.

    At every step the pair (review, title-so-far + '[MASK]') is tokenized, the
    module-level `bert` model is run on it, and the argmax of the logits of the
    first (only) sequence is taken as the id of the next title token. Generation
    stops when that id equals `_EOS` or after `max_tokens` steps.

    Relies on the module-level names `tokenizer`, `bert`, `_EOS` and `_SEP`.

    Args:
        review_text: The review to summarize, as a single string.
        max_tokens: Maximum number of decoding steps (default 11, the value that
            used to be hard-coded).
        verbose: When True (default, the historical behavior) print the decoded
            model input at every step.

    Returns:
        The generated title as a string with the '[MASK]' placeholder removed and
        surrounding whitespace stripped; '' if the first prediction is `_EOS`.
    """
    whole_title = '[MASK]'

    for _ in range(max_tokens):
        tokenized_input = tokenizer([review_text], [whole_title], return_tensors='tf')
        if verbose:
            print(tokenizer.decode(tokenized_input['input_ids'][0]))

        # Classification-head variant: the logits of the single input sequence are
        # scored directly, and the position of the best score is used as a token id.
        logits = bert(tokenized_input)[0][0]
        best_token = int(logits.numpy().argmax())

        if best_token == _EOS:
            break

        # Drop the last two ids of the input (presumably the trailing "[MASK] [SEP]")
        # and put the predicted token in their place.
        tokenized_input_ids = tokenized_input['input_ids'].numpy()[0, :-2].tolist()
        tokenized_input_ids.append(best_token)

        # Everything after the first [SEP] is the title generated so far; it is
        # turned back into text so the next step can re-tokenize it.
        sep_break = tokenized_input_ids.index(_SEP)
        whole_title = tokenizer.decode(tokenized_input_ids[sep_break + 1:]) + ' [MASK]'

    return whole_title.replace('[MASK]', '').strip()

In [37]:
def bert_mask_write(review_text, max_tokens=11, verbose=True):
    """Greedily generate a title for a single review by filling in a trailing [MASK].

    At every step the pair (review, title-so-far + '[MASK]') is tokenized and the
    module-level `bert` model is run on it. The logits are indexed with the
    positions whose input id equals `_MASK`, and the best-scoring entry of the
    last such position is taken as the next title token. Generation stops when
    that id equals `_EOS` or after `max_tokens` steps.

    Relies on the module-level names `tokenizer`, `bert`, `_MASK`, `_EOS`, `_SEP`.
    NOTE(review): the indexing assumes `bert` returns one row of vocabulary scores
    per input position (masked-LM style). The model loaded earlier in this
    notebook is a sequence classifier -- confirm which model this is meant for.

    Args:
        review_text: The review to summarize, as a single string.
        max_tokens: Maximum number of decoding steps (default 11, the value that
            used to be hard-coded).
        verbose: When True (default, the historical behavior) print the decoded
            model input at every step.

    Returns:
        The generated title as a string with the '[MASK]' placeholder removed and
        surrounding whitespace stripped; '' if the first prediction is `_EOS`.
    """
    whole_title = '[MASK]'

    for _ in range(max_tokens):
        tokenized_input = tokenizer([review_text], [whole_title], return_tensors='tf')
        if verbose:
            print(tokenizer.decode(tokenized_input['input_ids'][0]))

        logits = bert(tokenized_input)[0][0]
        mask_positions = tokenized_input['input_ids'][0] == _MASK
        # One row of scores per [MASK] in the input. Take the argmax per row and keep
        # the last row (the title's trailing mask), so that a stray '[MASK]' inside
        # the review cannot turn the result into an out-of-vocabulary flat index.
        best_token = int(logits[mask_positions].numpy().argmax(axis=-1)[-1])

        if best_token == _EOS:
            break

        # Drop the last two ids of the input (presumably the trailing "[MASK] [SEP]")
        # and put the predicted token in their place.
        tokenized_input_ids = tokenized_input['input_ids'].numpy()[0, :-2].tolist()
        tokenized_input_ids.append(best_token)

        # Everything after the first [SEP] is the title generated so far; it is
        # turned back into text so the next step can re-tokenize it.
        sep_break = tokenized_input_ids.index(_SEP)
        whole_title = tokenizer.decode(tokenized_input_ids[sep_break + 1:]) + ' [MASK]'

    return whole_title.replace('[MASK]', '').strip()

In [126]:
import numpy as np

In [273]:
def bert_cls_write(review_texts, max_tokens=11):
    """Greedily generate one title per review, decoding the whole batch in parallel.

    Every step tokenizes each (review, title-so-far + '[MASK]') pair padded to a
    common length, asks the module-level `bert` model for one token id per row
    (argmax over the last axis of its logits) and writes that id into the row's
    [MASK] slot. The title part of each row is then decoded back to text for the
    next step. All rows always run `max_tokens` steps; whatever follows the first
    '[unused1]' marker is discarded from the final text.

    Relies on the module-level names `tokenizer`, `bert`, `np`, `_MASK`, `_SEP`.

    Args:
        review_texts: Sequence of review strings.
        max_tokens: Number of decoding steps, i.e. the maximum title length in
            tokens (default 11, the value that used to be hard-coded).

    Returns:
        A list with one generated title per review, in input order. An empty
        input yields an empty list.

    Raises:
        ValueError: If a tokenized row does not contain exactly one [MASK] id.
    """
    if len(review_texts) == 0:
        # Nothing to decode; also avoids calling max() on an empty sequence below.
        return []

    whole_titles = ['[MASK]'] * len(review_texts)

    # Length (in tokens) of the longest review.
    review_max_len = max(
        len(review_ids)
        for review_ids in tokenizer(review_texts, add_special_tokens=False)['input_ids']
    )
    # Room for the longest review, the title (at most `max_tokens` ids including the
    # mask) and 4 extra slots for the special tokens around them.
    padded_len = review_max_len + max_tokens + 4

    for _ in range(max_tokens):
        tokenized_input = tokenizer(review_texts, whole_titles, return_tensors='tf',
                                    padding='max_length', max_length=padded_len)

        logits = bert(tokenized_input)[0]
        best_tokens = logits.numpy().argmax(axis=-1)

        batch_ids = tokenized_input['input_ids'].numpy()

        # Fill each row's single [MASK] slot with the token predicted for that row.
        for text_nb, input_ids in enumerate(batch_ids):
            mask_location = np.where(input_ids == _MASK)[0].item()
            batch_ids[text_nb][mask_location] = best_tokens[text_nb]

        next_titles = []
        for input_ids in batch_ids.tolist():
            # Drop everything up to and including the first [SEP] (the review part).
            title_ids = input_ids[input_ids.index(_SEP) + 1:]
            # Drop the next [SEP] and everything after it (closing [SEP] and padding).
            title_ids = title_ids[:title_ids.index(_SEP)]
            next_titles.append(tokenizer.decode(title_ids) + ' [MASK]')
        whole_titles = next_titles

    # Keep only the text before the first end marker. `split` leaves the string whole
    # when the marker is absent, whereas slicing with `find` (-1) would chop off the
    # last character and leave a dangling '[MASK' in the title.
    return [
        whole_title.split('[unused1]', 1)[0].replace('[MASK]', '').strip()
        for whole_title in whole_titles
    ]

In [278]:
# Smoke test of the batched generator: the two sample reviews repeated 10 times,
# i.e. a batch of 20 inputs.
bert_cls_write(['Excelente qualidade amei', 'Gostei muito, porém é meio raro']*10)

['Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo',
 'Gostei muito do produto',
 'Ótimo']