# GPT Example

In [1]:
# Import required libraries
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [2]:
# If you have a GPU, put everything on cuda
# tokens_tensor = tokens_tensor.to('cuda')
# model.to('cuda')

# Encode a text inputs
text = "Iran has"
end = False
while not end:
    indexed_tokens = tokenizer.encode(text)

    # Convert indexed tokens in a PyTorch tensor
    tokens_tensor = torch.tensor([indexed_tokens])


    # Predict all tokens
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]

    # Get the predicted next sub-word
    predicted_index = torch.argmax(predictions[0, -1, :]).item()
    predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])

    text = predicted_text
    
    if predicted_text[-1] == '\n':
        end = True
        
# Print the predicted word
print(predicted_text)

Iran has been a major player in the fight against terrorism.



## BERT and XLMRoberta example

In [3]:
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import BertTokenizer, BertForMaskedLM

In [4]:
tokenizer = XLMRobertaTokenizer.from_pretrained('novinsh/xlm-roberta-large-toxicomments-12k')
model = XLMRobertaForMaskedLM.from_pretrained('novinsh/xlm-roberta-large-toxicomments-12k', output_attentions=False)
model.eval()

XLMRobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNo

In [5]:
tokenizer2 = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model2 = BertForMaskedLM.from_pretrained('bert-base-multilingual-uncased', output_attentions=False)
model2.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [6]:
import numpy as np

def duplicates(lst, item):
    return [i for i, x in enumerate(lst) if x == item]

sentence = ""
sentence_orig = "Ich bin" #request.form.get('text')
sentence_length = 5 #request.form.get('len')
decoding_type = 'left to right' #request.form.get('decoding_type')
domain_type = '' # request.form.get('domain_type')
filler = ' '.join(['MASK' for _ in range(int(sentence_length))])

if domain_type=='review':
    starter = '[REVIEW]'
else:
    starter = ''

if len(sentence_orig.strip())==0:
    sentence = "[CLS] "+ starter + " " + filler + " . [SEP]"
else:
    sentence = "[CLS] " + starter + " " + sentence_orig + " " + filler + " . [SEP]"

print (sentence)

tokenized_text = tokenizer2.tokenize(sentence)
idxs = duplicates(tokenized_text, 'mask')
for masked_index in idxs:
    tokenized_text[masked_index] = "[MASK]"

##### LOOP TO CREATE TEXT #####
generated = 0
full_sentence = []
while generated<int(sentence_length):
    mask_idxs = duplicates(tokenized_text, "[MASK]")

    if decoding_type=='left to right':
        focus_mask_idx = min(mask_idxs)
    else:
        focus_mask_idx = np.random.choice(mask_idxs)

    mask_idxs.pop(mask_idxs.index(focus_mask_idx))
    temp_tokenized_text = tokenized_text.copy()
    temp_tokenized_text = [j for i, j in enumerate(temp_tokenized_text) if i not in mask_idxs]
    temp_indexed_tokens = tokenizer2.convert_tokens_to_ids(temp_tokenized_text)
    ff = [idx for idx, i in enumerate(temp_indexed_tokens) if i==103]
    temp_segments_ids = [0]*len(temp_tokenized_text)
    tokens_tensor = torch.tensor([temp_indexed_tokens])
    segments_tensors = torch.tensor([temp_segments_ids])

    with torch.no_grad():
        outputs = model2(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

    #TOP - k Sampling
    k=5
    predicted_index = np.random.choice(predictions[0, ff].argsort()[0][-k:]).item()
    predicted_token = tokenizer2.convert_ids_to_tokens([predicted_index])[0]
    tokenized_text[focus_mask_idx] = predicted_token
    generated += 1

' '.join(tokenized_text[1:-1]).replace('[ review ]','')

[CLS]  Ich bin MASK MASK MASK MASK MASK . [SEP]


'ich bin nicht allein gewesen sein kann .'

In [7]:
# tokenizer.unk_token_id
tokenizer.mask_token_id
# tokenizer.mask_token

250001

In [8]:
import numpy as np
import torch

def duplicates(lst, item):
    return [i for i, x in enumerate(lst) if x == item]

sentence = ""
sentence_orig = "This is stupid and " #request.form.get('text')
sentence_length = 10 #request.form.get('len')
decoding_type = 'left to right' #request.form.get('decoding_type')
domain_type = '' # request.form.get('domain_type')

filler = ' '.join([tokenizer.mask_token for _ in range(int(sentence_length))])

if domain_type=='toxic':
    starter = '[TOXIC]'
else:
    starter = ''

if len(sentence_orig.strip())==0:
    sentence = f"{tokenizer.cls_token} {starter} {filler} . {tokenizer.sep_token}"
else:
    sentence = f"{tokenizer.cls_token} {starter} {sentence_orig} {filler} . {tokenizer.sep_token}"

print (sentence)

tokenized_text = tokenizer.tokenize(sentence)

##### LOOP TO CREATE TEXT #####
generated = 0
full_sentence = []
while generated<int(sentence_length):
    mask_idxs = duplicates(tokenized_text, tokenizer.mask_token)

    if decoding_type=='left to right':
        focus_mask_idx = min(mask_idxs)
    else:
        focus_mask_idx = np.random.choice(mask_idxs)

    mask_idxs.pop(mask_idxs.index(focus_mask_idx))
    temp_tokenized_text = tokenized_text.copy()
    temp_tokenized_text = [j for i, j in enumerate(temp_tokenized_text) if i not in mask_idxs]
    temp_indexed_tokens = tokenizer.convert_tokens_to_ids(temp_tokenized_text)
    ff = [idx for idx, i in enumerate(temp_indexed_tokens) if i==tokenizer.mask_token_id]
    temp_segments_ids = [0]*len(temp_tokenized_text)
    tokens_tensor = torch.tensor([temp_indexed_tokens])
    segments_tensors = torch.tensor([temp_segments_ids])

    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = outputs[0]

#     print(ff)
#     print(predictions[0, ff])
    #TOP - k Sampling
    k=5
    predicted_index = np.random.choice(predictions[0, ff].argsort()[0][-k:]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    tokenized_text[focus_mask_idx] = predicted_token
    generated += 1

generated_text = ' '.join(tokenized_text[1:-1]).replace('[ TOXIC ]','')
generated_text = ''.join([t for t in generated_text if t != '▁'])
generated_text

<s>  This is stupid and  <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> . </s>


'This is stupid and wrong ! ... ... , march , : - -  .'