## Setup

In [None]:
!pip list

Package                       Version
----------------------------- ------------------------------
absl-py                       1.1.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.12.1
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.0
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.4
Babel                         2.10.1
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        5.0.0
blis                          0.4.1
bokeh                         2.3.3
Bottleneck                    1.3.4
branca                        0.5.0
bs4                           0.

### VM setup

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd "/content/gdrive/MyDrive/Master Thesis/Language Model Training"

/content/gdrive/MyDrive/Master Thesis/Language Model Training


In [None]:
!pip install transformers==4.17

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 24.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 56.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 37.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 9.1 MB/s 
Building wheels for collected packa

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.0-py3-none-any.whl (361 kB)
[K     |████████████████████████████████| 361 kB 32.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 72.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 57.7 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 67.4 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |██████████████████

### Imports

In [None]:
from pathlib import Path

import torch

import transformers
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import FillMaskPipeline

import datasets
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer

In [None]:
rob_tok_test = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
rob_tok_test.special_tokens_map

{'bos_token': '<s>',
 'cls_token': '<s>',
 'eos_token': '</s>',
 'mask_token': '<mask>',
 'pad_token': '<pad>',
 'sep_token': '</s>',
 'unk_token': '<unk>'}

### Prepare corpus files
The files are saved to Google Drive, so this step only needs to be done once.

In [None]:
# !unzip corpus.zip -d ./data/

In [None]:
!ls ./data/corpus/ | wc -l

3110


In [None]:
!cat ./data/twitch_lol_combined.txt | wc -l

88973798


In [None]:
# !cat ./data/corpus/*.txt > ./data/twitch_lol_combined.txt

## Tokenizer

### Train the tokenizer (OLD)

Found this one in [tokenizer_training](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb):
```
tokenizer = Tokenizer(models.BPE())
```

How does this differ from the tokenizer used in [how_to_train_a_language_model](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)?
```
tokenizer = ByteLevelBPETokenizer()
```

No need to add a separate classifier token: RoBERTa already uses `<s>` as its `cls_token`.




In [None]:
paths = [str(x) for x in Path(".").glob("data/corpus/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# possibility to add pre-tokenizer here
# tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

CPU times: user 30min 19s, sys: 15.5 s, total: 30min 34s
Wall time: 16min


In [None]:
%%time
# Customize training
tokenizer.train(files=paths, vocab_size=30000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [None]:
# Create the output directory idempotently; `!mkdir` reports an error on every re-run
# once the directory exists.
Path("TwitchLeagueBert").mkdir(exist_ok=True)
tokenizer.save_model("TwitchLeagueBert")

['TwitchLeagueBert/vocab.json', 'TwitchLeagueBert/merges.txt']

## Train a Tokenizer (NEW) from existing tokenizer (using same configuration)

We can simply use an existing Tokenizer, for example for RoBERTa, and retrain it from scratch using the same configuration.

In [None]:
%%time

dataset = load_dataset('text', data_files={'train': "data/twitch_lol_combined.txt"})

Using custom data configuration default-83f6c26edc6ac600


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 1min 8s, sys: 17.8 s, total: 1min 26s
Wall time: 1min 39s


In [None]:
len(dataset["train"])

88973799

In [None]:
batch_size = 1000


def batch_iterator():
    """Yield the ``text`` column of the training split in consecutive chunks.

    Reads the module-level ``dataset`` and ``batch_size``. Every chunk holds
    ``batch_size`` rows except possibly the last one, which holds whatever is left.
    """
    chunk_starts = range(0, len(dataset["train"]), batch_size)
    yield from (
        dataset["train"][lo : lo + batch_size]["text"] for lo in chunk_starts
    )

In [None]:
dataset["train"][:10]["text"]

['thx god an english stream exists CoolCat',
 'true!!! <3',
 'Jebaited',
 'Is this on the newt patch? Oh god',
 'yes',
 'Chat is booming',
 'Booming... booming... booming...',
 'they are playing on live patch, right?^^',
 'I doubt it',
 'Probably']

In [None]:
tokenizer_old = RobertaTokenizerFast.from_pretrained("roberta-base")

assert(tokenizer_old.is_fast)

In [None]:
tokenizer_old.tokenize("LUL that was an awesome play Kappa, EleGiggle MorphinTime Poggers poggers PogChamp pogchamp KreyGasm")

['L',
 'UL',
 'Ġthat',
 'Ġwas',
 'Ġan',
 'Ġawesome',
 'Ġplay',
 'ĠKappa',
 ',',
 'ĠEle',
 'G',
 'iggle',
 'ĠMorph',
 'in',
 'Time',
 'ĠPog',
 'gers',
 'Ġp',
 'og',
 'gers',
 'ĠPog',
 'Champ',
 'Ġp',
 'og',
 'ch',
 'amp',
 'ĠK',
 'rey',
 'G',
 'asm']

In [None]:
%%time

tokenizer = tokenizer_old.train_new_from_iterator(batch_iterator(), vocab_size=50000)

CPU times: user 55min 47s, sys: 2min 43s, total: 58min 31s
Wall time: 30min 28s


In [None]:
tokenizer.tokenize("LUL that was an awesome play Kappa, EleGiggle MorphinTime Poggers poggers PogChamp pogchamp KreyGasm")

['LUL',
 'Ġthat',
 'Ġwas',
 'Ġan',
 'Ġawesome',
 'Ġplay',
 'ĠKappa',
 ',',
 'ĠEleGiggle',
 'ĠMorphinTime',
 'ĠPoggers',
 'Ġpoggers',
 'ĠPogChamp',
 'Ġpogchamp',
 'ĠKreyGasm']

In [None]:
tokenizer.vocab_size

50000

In [None]:
tokenizer(["LUL that was an awesome play Kappa, EleGiggle MorphinTime Poggers poggers PogChamp pogchamp KreyGasm"])

{'input_ids': [[0, 213, 573, 812, 714, 12154, 577, 378, 16, 463, 354, 3496, 5794, 290, 15186, 7252, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
tokenizer.save_pretrained("TwitchLeagueBert")

('TwitchLeagueBert/tokenizer_config.json',
 'TwitchLeagueBert/special_tokens_map.json',
 'TwitchLeagueBert/vocab.json',
 'TwitchLeagueBert/merges.txt',
 'TwitchLeagueBert/added_tokens.json',
 'TwitchLeagueBert/tokenizer.json')

### Instantiate Tokenizer (not needed)

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

train_tokenizer = ByteLevelBPETokenizer(
    "./TwitchLeagueBert/vocab.json",
    "./TwitchLeagueBert/merges.txt",
)

# tokenizer.pre_tokenizer = Whitespace() # should not do this as RobertaTokenizer doesn't do it either => loss of information where words begin and continue

In [None]:
# Look the special-token ids up on the tokenizer that is being configured.
# The variable `tokenizer` was rebound to the retrained transformers tokenizer further up,
# which is a different kind of object than this `tokenizers` ByteLevelBPETokenizer.
train_tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", train_tokenizer.token_to_id("</s>")),
    ("<s>", train_tokenizer.token_to_id("<s>")),
)
# Cut encodings off at 512 tokens.
train_tokenizer.enable_truncation(max_length=512)

### Look at some tokenization examples

In [None]:
encoded = tokenizer("LUL that was an awesome play Kappa, EleGiggle MorphinTime Poggers poggers PogChamp pogchamp KreyGasm")
encoded

{'input_ids': [0, 213, 573, 812, 714, 12154, 577, 378, 16, 463, 354, 3496, 5794, 290, 15186, 7252, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
print(encoded.tokens())
print(encoded.input_ids)
print(encoded.attention_mask)

['<s>', 'LUL', 'Ġthat', 'Ġwas', 'Ġan', 'Ġawesome', 'Ġplay', 'ĠKappa', ',', 'ĠEleGiggle', 'ĠMorphinTime', 'ĠPoggers', 'Ġpoggers', 'ĠPogChamp', 'Ġpogchamp', 'ĠKreyGasm', '</s>']
[0, 213, 573, 812, 714, 12154, 577, 378, 16, 463, 354, 3496, 5794, 290, 15186, 7252, 2]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
tokenizer.vocab_size

50000

In [None]:
paths

In [None]:
!cat data/*.txt | wc -w

cat: 'data/*.txt': No such file or directory
0


In [None]:
# found some characters used in other languages

for f_path in paths:
  # Decode explicitly as UTF-8: the search below looks for a non-ASCII character,
  # so the result must not depend on the platform's default encoding.
  with open(f_path, "r", encoding="utf-8") as in_file:
    text = in_file.read()

  if "廷" in text:
    print(f_path)

data/52706801.txt


## Check GPU availability

In [None]:
# Check that we have a GPU
!nvidia-smi

Mon May 16 08:01:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
torch.cuda.is_available()

True

## Tokenize and group the dataset

### Load Tokenizer

In [None]:
# `model_max_length` is the current name of the limit; `max_len` is only a legacy alias.
tokenizer = RobertaTokenizerFast.from_pretrained("./TwitchLeagueBert", model_max_length=512)

### Some examples

In [None]:
tokenizer.encode("LUL WTF EleGiggle")

[0, 213, 1607, 463, 2]

In [None]:
tokenizer.encode("LUL WTF EleGiggle")

[0, 213, 1607, 463, 2]

In [None]:
tokenizer.decode([0, 20085, 20680, 20278, 2])

'<s>EYES wonnedCaptain</s>'

In [None]:
tokenizer.decode([0, 20085, 236, 20680, 236, 20278, 46034, 2])

'<s>EYES C wonned CCaptain cancion</s>'

### Build Dataset

https://www.depends-on-the-definition.com/missing-guide-on-data-preparation-for-language-modeling/


In [None]:
dataset = load_dataset('text', data_files={'train': "data/twitch_lol_combined.txt"})

Using custom data configuration default-83f6c26edc6ac600


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
dataset["train"][:5]

{'text': ['thx god an english stream exists CoolCat',
  'true!!! <3',
  'Jebaited',
  'Is this on the newt patch? Oh god',
  'yes']}

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with the module-level ``tokenizer``.

    The special-tokens mask is requested as well so that the data collator used
    later can reuse it.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

In [None]:
# tokenize
dataset_tok = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

      

#0:   0%|          | 0/22244 [00:00<?, ?ba/s]

#1:   0%|          | 0/22244 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/22244 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/22244 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1324 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1216 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (769 > 512). Running this sequence through the model will result in indexing errors


In [None]:
dataset_tok["train"][:10]

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1]],
 'input_ids': [[0,
   25894,
   236,
   21282,
   236,
   20092,
   236,
   23571,
   236,
   21459,
   236,
   20864,
   26662,
   236,
   22369,
   2],
  [0, 22744, 21169, 236, 33, 24, 2],
  [0, 20642, 2],
  [0,
   21817,
   236,
   20323,
   236,
   20091,
   236,
   20168,
   236,
   21548,
   89,
   236,
   23138,
   36,
   236,
   20950,
   236,
   21282,
   2],
  [0, 21440, 2],
  [0, 21623, 236, 20103, 236, 22262, 26970, 2],
  [0,
   23582,
   26970,
   20675,
   236,
   22262,
   26970,
   20675,
   236,
   22262,
   26970,
   20675,
   2],
  [0,
   20476,
   236,
   20391,
   236,
   21280,
   236,
   20091,
   236,
   22299,
 

In [None]:
# save for later use
dataset_tok.save_to_disk("./data/corpus_tokenized_dataset")

In [None]:
# reload from disk when using later
dataset_tok = datasets.DatasetDict.load_from_disk("./data/corpus_tokenized_dataset")

FileNotFoundError: ignored

In [None]:
# free up some RAM
del dataset

In [None]:

def group_texts(examples, block_size=128, pad_token_id=None):
    """
    Concatenate a batch of tokenized examples and cut it into blocks of equal size.

    :param examples: batch mapping each column name (e.g. ``input_ids``) to a list of per-example lists
    :param block_size: number of items (tokens) in every returned block
    :param pad_token_id: id used to fill up the last ``input_ids`` block;
        defaults to ``tokenizer.pad_token_id`` of the module-level tokenizer

    :return: dict with the same columns, each split into blocks of exactly ``block_size`` items
        (the last block is padded if necessary), plus a ``labels`` column copied from ``input_ids``
    """
    from itertools import chain

    # Flatten every column in linear time; sum(list_of_lists, []) copies the
    # accumulated list on every step and is quadratic in the batch size.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Tokens that do not fill a whole block are appended as a padded block below.
    remainder = total_length % block_size
    total_length = (total_length // block_size) * block_size

    # Split into full blocks of block_size items.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    if remainder > 0:
        if pad_token_id is None:
            pad_token_id = tokenizer.pad_token_id
        n_pad = block_size - remainder
        for k, tokens in concatenated_examples.items():
            if k == "input_ids":
                fill = pad_token_id
            elif k == "special_tokens_mask":
                # Padding counts as a special token, so the MLM collator never
                # selects a pad position for masking.
                fill = 1
            else:
                # Zero value of the column's element type (0 for attention_mask).
                fill = type(tokens[0])()
            result[k].append(tokens[-remainder:] + [fill] * n_pad)

    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_dataset = dataset_tok.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

NameError: ignored

In [None]:
lm_dataset.column_names

{'train': ['input_ids', 'attention_mask', 'labels']}

In [None]:
tokenizer.decode_batch(lm_dataset["train"][4340:4345]["input_ids"])

['</s><s>Let Khan be the MVP~</s><s>There was such a big draft gap, but C9 still lost</s><s>ARAY KO PUTANG INA ALA ANYARE PATAY KAYONG MGA ADIK KAYO TANGINA PATAY KAYONG MANGA ADIK KAYO TANG INA ANG LAYO NANG TIRA KO AKALA NYO SI SPIDER MAN NOO? TANG INA ',
 "AKO TO SI SPIDIGONG BRRRT BRRRT BRRRR</s><s>it's a bubble for the event</s><s>DK WON GUYS</s><s>?drop</s><s>yo</s><s>w00t</s><s>TEAM DIFF</s><s>Next title for Kkoma</s><s>NA</s><s>TOP GAP YEP</s><s>KEKW C9 FAN OMEGALUL</s><s>exactly</s><s>DFM AYAYA</s><s>ResidentSleeper</s><s>So wheres faker</s><s>TOP AND MACRO GAP KEKW</s><s>Pog</s><s>Where's ",
 'josedeodo? Kappa</s><s>Vedius delusional sad</s><s>COPIUM NEEDED FOR NA</s><s>hey guys what was that music ?</s><s>lol, Tahm counters morg. totally, not like Jungler wants to play the game anyways</s><s>AYAYA DFMWIN</s><s>Nobody cares about wildcard teams ResidentSleeper trash MSI format</s><s>NA GO HOME LUL LUL LUL</s><s>NA real bad</s><s>banzai',
 ' for the emperor</s><s>ResidentSleep

In [None]:
# save grouped dataset for later use
lm_dataset.save_to_disk("./data/corpus_grouped_dataset")

In [None]:
# load saved dataset in case we start from here
lm_dataset = datasets.DatasetDict.load_from_disk("./data/corpus_grouped_dataset")

In [None]:
# Fixed seed: the split must be identical in every session, otherwise rows held out for
# evaluation could end up in the training set when training is resumed from a checkpoint.
ds_split = lm_dataset["train"].train_test_split(test_size=0.1, seed=42)

In [None]:
ds_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6713117
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 745902
    })
})

## Train LM

### Data collator for adding `<mask>` tokens

https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorForLanguageModeling

For best performance, this data collator should be used with a dataset having items that are dictionaries or BatchEncoding, with the "special_tokens_mask" key, as returned by a PreTrainedTokenizer or a PreTrainedTokenizerFast with the argument return_special_tokens_mask=True.

https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/language_modeling_from_scratch.ipynb#scrollTo=z6uuUnvz3l_b

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Set up model config

In [None]:
from transformers import RobertaConfig

config = RobertaConfig(
    # Derived from the loaded tokenizer instead of a hard-coded number, so the embedding
    # matrix always covers every token id the tokenizer can produce (including added tokens).
    vocab_size=len(tokenizer),
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [None]:
model.num_parameters()
# => ~82 million parameters

81966416

### Finally, we are all set to initialize our Trainer

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints are written to the same directory that
# already holds the saved tokenizer files.
training_args = TrainingArguments(
  output_dir="./TwitchLeagueBert",
  overwrite_output_dir=False,
  num_train_epochs=3,
  per_device_train_batch_size=64,
  # write a checkpoint every 10k steps and keep at most 4 of them
  save_steps=10_000,
  save_total_limit=4,
  prediction_loss_only=True,
  # evaluate on the held-out split every 5k steps
  # NOTE(review): no per_device_eval_batch_size is set, so presumably the library default
  # applies; with ~746k evaluation rows each evaluation pass may be slow — confirm.
  evaluation_strategy="steps",
  eval_steps=5_000,
  report_to="all"
)

# The collator passed here is the masked-language-modelling collator built earlier in the notebook.
trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=data_collator,
  train_dataset=ds_split['train'],
  eval_dataset=ds_split['test']
)

PyTorch: setting up devices


### Training

In [None]:
%%time
trainer.train()

***** Running training *****
  Num examples = 6713117
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 314679


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored

### Load from previously saved checkpoint and redo the last steps

In [None]:
trainer.train(resume_from_checkpoint=True)

In [None]:
trainer.save_model()

# Load model and predict something

## Without pipeline

In [None]:
model = RobertaForMaskedLM.from_pretrained("./TwitchLeagueBert/checkpoint-70000/")

In [None]:
def fill_mask_manual(input):
  """Run the masked LM on a text and decode the top-scoring token of every position.

  Uses the module-level ``tokenizer`` and ``model``. Both tokenizer flavours used in
  this notebook are supported: ``encode`` may return an object exposing the ids as
  ``.ids`` or a plain list of ids.

  :param input: text to encode (the name shadows the builtin but is kept for compatibility)
  :return: string decoded from the arg-max token id of every position
  """
  encoded = tokenizer.encode(input)
  # Fall back to the value itself when encode() already returned a list of ids.
  token_ids = getattr(encoded, "ids", encoded)
  print(token_ids)
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    pred = model(torch.tensor([token_ids]))
  top_pred = torch.argmax(pred.logits, dim=-1)
  print(top_pred[0])
  return tokenizer.decode(top_pred[0].tolist())

In [None]:
fill_mask_manual("LUL this game was <mask>")

[0, 20085, 236, 20323, 236, 20432, 236, 20630, 236, 33, 82, 23051, 35, 2]
tensor([  236, 20085,   236, 20323,   236, 20432,   236, 20630,   236,    33,
           82, 23051,    35,    33])


' LUL this game was <mask><'

In [None]:
tokenizer.encode("<mask>").ids

[0, 33, 82, 23051, 35, 2]

## With pipeline

In [None]:
# `tokenizer_roberta` is not defined by any earlier cell; load the trained tokenizer explicitly
# so this section also works on a fresh kernel.
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("./TwitchLeagueBert", model_max_length=512)
mask_pipe = FillMaskPipeline(model, tokenizer_roberta)

In [None]:
mask_pipe(["LUL this game was <mask>"])

[{'score': 0.2531238794326782,
  'sequence': 'LUL this game was?',
  'token': 36,
  'token_str': '?'},
 {'score': 0.23930275440216064,
  'sequence': 'LUL this game was.',
  'token': 19,
  'token_str': '.'},
 {'score': 0.06268703937530518,
  'sequence': 'LUL this game was...',
  'token': 20675,
  'token_str': '...'},
 {'score': 0.03718893602490425,
  'sequence': 'LUL this game was!',
  'token': 6,
  'token_str': '!'},
 {'score': 0.026410797610878944,
  'sequence': 'LUL this game was\\',
  'token': 65,
  'token_str': '\\'}]

In [None]:
mask_pipe(["THE BEST IS <mask> AND THE OTHERS"])

[{'score': 0.11161301285028458,
  'sequence': 'THE BEST ISS AND THE OTHERS',
  'token': 56,
  'token_str': 'S'},
 {'score': 0.07581332325935364,
  'sequence': 'THE BEST ISIS AND THE OTHERS',
  'token': 20133,
  'token_str': 'IS'},
 {'score': 0.0732228085398674,
  'sequence': 'THE BEST ISAL AND THE OTHERS',
  'token': 20272,
  'token_str': 'AL'},
 {'score': 0.052002083510160446,
  'sequence': 'THE BEST IS. AND THE OTHERS',
  'token': 19,
  'token_str': '.'},
 {'score': 0.044503841549158096,
  'sequence': 'THE BEST IS, AND THE OTHERS',
  'token': 17,
  'token_str': ','}]

In [None]:
mask_pipe(["NA <mask> EU"], )

[{'score': 0.3670800030231476,
  'sequence': 'NA> EU',
  'token': 35,
  'token_str': '>'},
 {'score': 0.09040877968072891,
  'sequence': 'NA? EU',
  'token': 36,
  'token_str': '?'},
 {'score': 0.06588710099458694,
  'sequence': 'NA= EU',
  'token': 34,
  'token_str': '='},
 {'score': 0.037310414016246796,
  'sequence': 'NA< EU',
  'token': 33,
  'token_str': '<'},
 {'score': 0.03412696346640587,
  'sequence': 'NA, EU',
  'token': 17,
  'token_str': ','}]

In [None]:
mask_pipe(["LUL mods are trying to ban us <mask>"])

[{'score': 0.2190786600112915,
  'sequence': 'LUL mods are trying to ban usual',
  'token': 21850,
  'token_str': 'ual'},
 {'score': 0.13808147609233856,
  'sequence': 'LUL mods are trying to ban us.',
  'token': 19,
  'token_str': '.'},
 {'score': 0.12728427350521088,
  'sequence': 'LUL mods are trying to ban usa',
  'token': 70,
  'token_str': 'a'},
 {'score': 0.07508404552936554,
  'sequence': 'LUL mods are trying to ban us?',
  'token': 36,
  'token_str': '?'},
 {'score': 0.03820619359612465,
  'sequence': 'LUL mods are trying to ban us!',
  'token': 6,
  'token_str': '!'}]