In this notebook, we will take all the steps necessary to create a simple BERT-based pipeline for text generation using [Transformers](https://huggingface.co/transformers/index.html) and [DeepPavlov](https://deeppavlov.ai/) libraries.

In [None]:
!pip install deeppavlov==0.8.0 torch==1.4.0 transformers==2.8.0



In [None]:
from typing import List, Optional, Collection

import torch
from transformers import BertTokenizer, BertForMaskedLM

from deeppavlov import build_model
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

Define a DeepPavlov component to pre-process input text for the BERT model.  
[An existing TransformersBertPreprocessor class](https://github.com/deepmipt/DeepPavlov/blob/0.8.0/deeppavlov/models/preprocessors/transformers_preprocessor.py) does this and more but cannot work with paired texts.

In [None]:
@register('bert_encoder')
class TransformersBertEncoder(Component):
    """Tokenize a batch of texts, or of text pairs, into padded BERT input tensors.

    Args:
        pretrained_model: name or path handed to `BertTokenizer.from_pretrained`.
        **kwargs: accepted and ignored.
    """

    def __init__(self, pretrained_model: str = 'bert-base-uncased', **kwargs):
        self.tokenizer: BertTokenizer = BertTokenizer.from_pretrained(pretrained_model)

    def __call__(self, texts_batch: List[str], text_pairs_batch: Optional[List[str]] = None):
        """Encode the batch with special tokens and padding.

        Args:
            texts_batch: first (or only) text of every sample.
            text_pairs_batch: optional second text of every sample; when given,
                it is zipped with `texts_batch` so each sample is encoded as a pair.

        Returns:
            A tuple `(input_ids, attention_mask, token_type_ids)` taken from the
            tokenizer output, which is requested as PyTorch tensors.
        """
        # Single texts go to the tokenizer unchanged; pairs become (text, pair) tuples.
        samples = texts_batch if text_pairs_batch is None else list(zip(texts_batch, text_pairs_batch))

        encoded = self.tokenizer.batch_encode_plus(
            samples,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors='pt',
            return_attention_masks=True,
        )
        input_ids = encoded['input_ids']
        attention_mask = encoded['attention_mask']
        token_type_ids = encoded['token_type_ids']
        return input_ids, attention_mask, token_type_ids

A simple BERT-based class to generate a follow-up to an initial text.  
It samples tokens one at a time until `max_generated_tokens` tokens have been generated.

In [None]:
@register('bert_generator')
class TransformersBertGenerator(Component):
    """Extend every input sequence with tokens sampled from a BERT masked language model.

    The last attended token of each row (the closing `[SEP]` when the encoder
    added special tokens) is replaced by the first of `max_generated_tokens`
    `[MASK]` tokens, a new `[SEP]` is placed after the masks, and the masks are
    then filled in left to right, one model call per generated token.

    Args:
        pretrained_model: name or path handed to `BertForMaskedLM.from_pretrained`.
        max_generated_tokens: how many tokens to generate for every row.
        mask_token_id: vocabulary id of the `[MASK]` token.
        sep_token_id: vocabulary id of the `[SEP]` token.
        pad_token_id: vocabulary id of the padding token.
        **kwargs: accepted and ignored.
    """

    def __init__(self, pretrained_model: str = 'bert-base-uncased',
                 max_generated_tokens: int = 15,
                 mask_token_id: int = 103, sep_token_id: int = 102, pad_token_id: int = 0, **kwargs):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model: BertForMaskedLM = BertForMaskedLM.from_pretrained(pretrained_model).to(self.device)
        self.max_generated_tokens = max_generated_tokens

        # Kept as attributes for backward compatibility with code that reads them.
        self.mask_tensor = torch.tensor(mask_token_id, device=self.device)
        self.sep_tensor = torch.tensor(sep_token_id, device=self.device)
        self.pad_tensor = torch.tensor(pad_token_id, device=self.device)

        # Plain Python ints are what `__call__` uses as fill values, the same way
        # it fills the attention mask with the literal 1.
        self._mask_token_id = int(mask_token_id)
        self._sep_token_id = int(sep_token_id)
        self._pad_token_id = int(pad_token_id)

    @staticmethod
    def _sample(prediction_scores: torch.Tensor):
        """Draw one token id per row from the softmax over the vocabulary.

        Args:
            prediction_scores: scores of shape (batch, 1, vocab), as gathered by `__call__`.

        Returns:
            A (batch, 1) tensor of sampled token ids.
        """
        probas = torch.nn.functional.softmax(prediction_scores[:, 0], dim=-1)
        return torch.multinomial(probas, num_samples=1)

    def __call__(self, input_ids: torch.Tensor, attention_masks: torch.Tensor, token_type_ids: torch.Tensor):
        """Generate `max_generated_tokens` tokens after the attended part of every row.

        Args:
            input_ids: (batch, seq) token ids.
            attention_masks: (batch, seq) mask; the number of ones in a row is
                taken as that row's length, so the ones are assumed to form a prefix.
            token_type_ids: (batch, seq) segment ids.

        Returns:
            A numpy array of shape (batch, seq + max_generated_tokens) holding the
            input ids followed by the generated ids, a `[SEP]` and padding.
        """
        input_ids = input_ids.to(self.device)
        attention_masks = attention_masks.to(self.device)
        token_type_ids = token_type_ids.to(self.device)

        batch_size = input_ids.shape[0]
        n_new = self.max_generated_tokens
        with torch.no_grad():
            # indexes of all tokens that will be generated; the first one overwrites
            # the last attended token of the row
            lengths = attention_masks.sum(dim=1, keepdim=True)
            mask_indexes = torch.arange(n_new, device=self.device).expand(batch_size, -1) + lengths - 1
            # positions that become attended on top of the original ones
            added_positions = mask_indexes + 1

            # expand attention masks and token type matrices to accommodate the additional tokens
            attention_masks = torch.cat([attention_masks, attention_masks.new_zeros((batch_size, n_new))], dim=1)
            attention_masks.scatter_(1, added_positions, 1)
            token_type_ids = torch.cat([token_type_ids, token_type_ids.new_ones((batch_size, n_new))], dim=1)
            # In rows shorter than the batch width the generated tokens land in columns
            # that were padding, which keep whatever segment id the encoder gave to
            # padding; mark them as segment 1 explicitly, like the appended columns.
            token_type_ids.scatter_(1, added_positions, 1)

            # expand the token id matrix with paddings
            input_ids = torch.cat([input_ids, input_ids.new_full((batch_size, n_new), self._pad_token_id)], dim=1)
            # insert [MASK] and [SEP] tokens
            input_ids.scatter_(1, mask_indexes, self._mask_token_id)
            input_ids.scatter_(1, attention_masks.sum(dim=1, keepdim=True) - 1, self._sep_token_id)

            # fill in masks one by one
            for i in range(n_new):
                indexes = mask_indexes[:, i:i + 1]
                prediction_scores = self.model(input_ids,
                                               attention_mask=attention_masks,
                                               token_type_ids=token_type_ids)[0]
                # pick the vocabulary scores at the current mask position of every row
                mask_predictions = prediction_scores.gather(
                    1, indexes.unsqueeze(-1).expand(-1, -1, prediction_scores.shape[-1]))
                input_ids.scatter_(1, indexes, self._sample(mask_predictions))
        return input_ids.cpu().numpy()

A component that decodes output ids back into text for the second sentence of each pair.  
It skips everything up to the first `'[SEP]'` token, then decodes tokens until it meets another `'[SEP]'` token or one of the `stopwords`.

In [None]:
@register('bert_decoder')
class TransformersBertDecoder(Component):
    """Turn generated id sequences back into the text of their second segment.

    Args:
        tokenizer: tokenizer whose `convert_ids_to_tokens` maps ids to word pieces.
        stopwords: tokens that end the decoded text; the stopword itself is kept.
        **kwargs: accepted and ignored.
    """

    def __init__(self, tokenizer: BertTokenizer, stopwords: Collection[str] = ('.', '?', '!'), **kwargs):
        self.tokenizer = tokenizer
        self.stopwords = set(stopwords)

    def __call__(self, ids_batch: List[List[int]]):
        """Decode every row of `ids_batch` into a string.

        Everything up to and including the first `[SEP]` is dropped. The
        following tokens are collected until another `[SEP]` (excluded) or a
        stopword (included) is met, then word pieces are glued back together.
        A row without any `[SEP]` decodes to an empty string.
        """
        decoded = []

        for row_ids in ids_batch:
            pieces = list(self.tokenizer.convert_ids_to_tokens(row_ids))

            # locate the start of the second segment
            try:
                start = pieces.index('[SEP]') + 1
            except ValueError:
                start = len(pieces)

            kept = []
            for piece in pieces[start:]:
                if piece == '[SEP]':
                    break
                kept.append(piece)
                if piece in self.stopwords:
                    break

            # merge word pieces: 'fram ##ework' -> 'framework'
            text = ' '.join(kept)
            text = text.replace(' ##', '')
            text = text.replace('##', '')
            decoded.append(text)

        return decoded

A DeepPavlov configuration for the whole pipeline.  
Read the DeepPavlov documentation for more information on [what it is](http://docs.deeppavlov.ai/en/0.8.0/intro/configuration.html) and how to [access custom components](http://docs.deeppavlov.ai/en/0.8.0/devguides/registry.html).

In [None]:
# Step 1: tokenize the (text, suggestion) pairs. The `id` lets later steps
# refer to this component and its tokenizer through `#encoder...` references.
encoder_step = {
    'class_name': 'bert_encoder',
    'id': 'encoder',
    'pretrained_model': '{PRETRAINED_MODEL}',
    'in': ['texts', 'suggestions'],
    'out': ['input_ids', 'attention_masks', 'token_type_ids'],
}

# Step 2: generate tokens after every suggestion. Special token ids are read
# from the encoder's tokenizer instead of being hard-coded.
generator_step = {
    'class_name': 'bert_generator',
    'pretrained_model': '{PRETRAINED_MODEL}',
    'max_generated_tokens': 10,
    'mask_token_id': '#encoder.tokenizer.mask_token_id',
    'sep_token_id': '#encoder.tokenizer.sep_token_id',
    'pad_token_id': '#encoder.tokenizer.pad_token_id',
    'in': ['input_ids', 'attention_masks', 'token_type_ids'],
    'out': ['output_ids'],
}

# Step 3: decode the generated ids back into text with the same tokenizer.
decoder_step = {
    'class_name': 'bert_decoder',
    'tokenizer': '#encoder.tokenizer',
    'stopwords': ['.', '!', '?'],
    'in': ['output_ids'],
    'out': ['result'],
}

# Full pipeline configuration: inputs, ordered steps, outputs and variables.
config = {
    'chainer': {
        'in': ['texts', 'suggestions'],
        'pipe': [encoder_step, generator_step, decoder_step],
        'out': ['result'],
    },
    'metadata': {
        'variables': {
            'PRETRAINED_MODEL': 'bert-base-uncased',
        },
    },
}

Initialize the model and test it on some inputs

In [None]:
# Build the pipeline described by `config`. The encoder and generator
# constructors call `from_pretrained`, which presumably fetches the
# pretrained tokenizer and weights on first use (confirm for your setup).
dp_model = build_model(config)

In [None]:
# Each example pairs a context sentence with the beginning of the sentence
# that the model should continue.
examples = [
    (
        'DeepPavlov is an open source conversational AI framework.',
        'I think that it',
    ),
    (
        'The inference can speed up multiple times if you switch from CPU to GPU usage.',
        'No result is an expected behavior and it means',
    ),
    (
        'It is a period of civil war.',
        'Rebel spaceships, striking from a hidden base, have won their first victory against',
    ),
]
texts = [context for context, _ in examples]
suggestions = [beginning for _, beginning in examples]

results = dp_model(texts, suggestions)

# One (context, generated sentence) tuple per line.
print('\n'.join(str(pair) for pair in zip(texts, results)))

('DeepPavlov is an open source conversational AI framework.', 'i think that it is a base for the service in general .')
('The inference can speed up multiple times if you switch from CPU to GPU usage.', 'no result is an expected behavior and it means no one will have the first planned event occurs .')
('It is a period of civil war.', 'rebel spaceships , striking from a hidden base , have won their first victory against a force , and later , a revolutionary coalition .')
