In [None]:
!pip install transformers

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch

In [None]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        The input tensor is modified in place and the same tensor is returned;
        pass ``logits.clone()`` if the unfiltered values are still needed.

        Args:
            logits: 1-D logits distribution, shape (vocabulary size,).
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
                Values larger than the vocabulary size are clamped; entries tied
                with the k-th largest logit are kept as well.
            top_p >0.0: keep the highest-probability tokens up to and including the
                first one whose cumulative probability strictly exceeds top_p
                (nucleus filtering). The most likely token is always kept.
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written over every filtered-out logit.
        Returns:
            The filtered ``logits`` tensor (same object as the input).
        Source:
            https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
        indices_to_remove = logits < kth_largest
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        # torch.softmax is used directly: this file never imports torch.nn.functional,
        # so the former `F.softmax` call raised NameError on this branch.
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Map the mask from sorted order back to vocabulary positions.
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

In [None]:
def sample_token(output, top_k=10, top_p=0.0):
    """ Sample one token id from the last position of a model's output logits.

        Args:
            output: logits tensor. The last index along the second-to-last axis is
                taken and a leading axis of size 1 is squeezed away; the result
                must be 1-D (the filtering step asserts this).
            top_k: forwarded to ``top_k_top_p_filtering``; defaults to 10, the
                value that used to be hard-coded here.
            top_p: forwarded to ``top_k_top_p_filtering``; the default 0.0 leaves
                nucleus filtering disabled.
        Returns:
            A 0-dim tensor holding the sampled token id.
    """
    logits = output[..., -1, :].squeeze(0)
    logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    # Softmax yields probabilities (not log-probabilities); filtered entries
    # set to -inf come out as exactly 0 and can never be drawn.
    probs = torch.softmax(logits, dim=-1)
    token = torch.multinomial(probs, num_samples=1)[0]

    return token

## Transformer-XL

In [None]:
# Load the tokenizer and LM-head model for the 'transfo-xl-wt103' checkpoint.
# `tokenizer` and `model` are rebound by the GPT-2 and XLM sections further down,
# so re-run this cell before the Transformer-XL cells if those were run in between.
tokenizer = AutoTokenizer.from_pretrained('transfo-xl-wt103')
model = AutoModelWithLMHead.from_pretrained('transfo-xl-wt103')

In [None]:
# Encode the prompt into a list of token ids; `generated` accumulates the full sequence.
generated = tokenizer.encode("On our way to the beach")
# Wrapping the list in another list adds a leading batch axis of size 1.
context = torch.tensor([generated])
# Nothing cached yet; this is passed as `mems` on the first model call.
past = None

In [None]:
# Sample 100 tokens one at a time. Gradients are never used during generation,
# so autograd is disabled to avoid building a graph on every forward pass.
with torch.no_grad():
    for i in range(100):
        # The model's second return value is fed back in as `mems` on the next step.
        output, past = model(context, mems=past)
        token = sample_token(output)

        generated.append(token.item())
        # Only the freshly sampled token is fed next, reshaped to (1, 1).
        context = token.view(1, -1)

In [None]:
# Decode the accumulated ids (prompt plus sampled tokens) back to text.
print(tokenizer.decode(generated))

## GPT-2

In [None]:
# Load the tokenizer and LM-head model for the 'gpt2-large' checkpoint.
# This rebinds the notebook-wide `tokenizer` and `model` used by the previous section.
tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
model = AutoModelWithLMHead.from_pretrained('gpt2-large')

In [None]:
# Encode the prompt into a list of token ids; `generated` accumulates the full sequence.
generated = tokenizer.encode("On our way to the beach")
# Wrapping the list in another list adds a leading batch axis of size 1.
context = torch.tensor([generated])
# Nothing cached yet; this is passed as `past` on the first model call.
past = None

In [None]:
# Sample 100 tokens one at a time. `past` is fed back into the model each step,
# so with autograd enabled it would keep every earlier step's graph alive;
# generation needs no gradients, hence torch.no_grad().
with torch.no_grad():
    for i in range(100):
        # NOTE(review): the `past=` keyword and tuple-style return match the
        # transformers release this notebook was written for - confirm against
        # the installed version.
        output, past = model(context, past=past)
        token = sample_token(output)

        generated.append(token.item())
        # Only the freshly sampled token is fed next.
        context = token.unsqueeze(0)

In [None]:
# Decode the accumulated ids (prompt plus sampled tokens) back to text.
print(tokenizer.decode(generated))

## XLM

In [None]:
# Load the tokenizer and LM-head model for the 'xlm-clm-enfr-1024' checkpoint.
# This rebinds the notebook-wide `tokenizer` and `model` used by the earlier sections.
tokenizer = AutoTokenizer.from_pretrained('xlm-clm-enfr-1024')
model = AutoModelWithLMHead.from_pretrained('xlm-clm-enfr-1024')

In [None]:
# No text prompt here: generation starts from a single hard-coded token id.
generated = [0] # start with just <s>
# Wrapping the list in another list adds a leading batch axis of size 1.
context = torch.tensor([generated])
# Language id; the generation loop broadcasts it into the `langs` tensor.
lang = 0 # English

In [None]:
# Sample 100 tokens one at a time. Generation needs no gradients, so autograd
# is disabled to avoid building a graph on every forward pass.
with torch.no_grad():
    for i in range(100):
        # One language id per input position, same shape and dtype as `context`.
        langs = torch.full_like(context, lang)
        output, = model(context, langs=langs)
        token = sample_token(output)

        generated.append(token.item())
        # No cached state is passed back, so the whole sequence is re-fed each step.
        context = torch.tensor([generated])

In [None]:
# Decode the accumulated ids (start token plus sampled tokens) back to text.
print(tokenizer.decode(generated))

In [None]:
# Reset the sequence to the single hard-coded start token id for a second run.
generated = [0] # start with just <s>
# Wrapping the list in another list adds a leading batch axis of size 1.
context = torch.tensor([generated])
# Language id; the generation loop broadcasts it into the `langs` tensor.
lang = 1 # French

In [None]:
# Sample 100 tokens one at a time. Generation needs no gradients, so autograd
# is disabled to avoid building a graph on every forward pass.
with torch.no_grad():
    for i in range(100):
        # One language id per input position, same shape and dtype as `context`.
        langs = torch.full_like(context, lang)
        output, = model(context, langs=langs)
        token = sample_token(output)

        generated.append(token.item())
        # No cached state is passed back, so the whole sequence is re-fed each step.
        context = torch.tensor([generated])

In [None]:
# Decode the accumulated ids (start token plus sampled tokens) back to text.
print(tokenizer.decode(generated))