## Setup

### VM setup

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd "/content/gdrive/MyDrive/Master Thesis/Language Model Training"

/content/gdrive/MyDrive/Master Thesis/Language Model Training


In [3]:
!pip install transformers==4.18

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.18
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 57.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 33.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.0-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 52.0 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=27d1eb561231be9f3808715ff3682aa0a25dade3afa

In [4]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 4.7 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 10.5 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 11.9 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 12.3 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.

TODO: use MLM scoring from https://github.com/awslabs/mlm-scoring
=> currently not possible because of a version incompatibility

### Imports

In [5]:
from pathlib import Path
import random
import torch

from copy import deepcopy

import transformers
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import FillMaskPipeline
from transformers import TrainingArguments, Trainer

import datasets
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer

import numpy as np

import glob
import json

## Loading components

### Load Data

In [None]:
dataset = load_dataset('text', data_files={'train': "data/twitch_lol_combined.txt"})



Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Hold out 10% of the rows as a test split. The fraction is arbitrary and the same as
# the validation split used during training, but this time the split is seeded.
# Author's caveat: some examples may overlap.
ds_split = dataset["train"].train_test_split(test_size=0.1, seed=42069)

In [None]:
ds_split

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 80076419
    })
    test: Dataset({
        features: ['text'],
        num_rows: 8897380
    })
})

### Load Tokenizer



In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("Epidot/TwitchLeagueBert-1000k")

Downloading:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/429k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

### Load Model

In [6]:
model = RobertaForMaskedLM.from_pretrained("Epidot/TwitchLeagueBert-1000k")

Downloading:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

In [7]:
# Display the model architecture (the bare expression is rendered as the cell output).
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [8]:
def count_parameters(model):
  """Return the total number of elements in the trainable parameters of ``model``.

  A parameter contributes ``numel()`` elements only when its ``requires_grad``
  attribute is truthy; all other parameters are ignored.
  """
  total = 0
  for param in model.parameters():
    if param.requires_grad:
      total += param.numel()
  return total

In [9]:
count_parameters(model)

81966416

### Create DataCollator

In [None]:
data_collator = DataCollatorForLanguageModeling(
  tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Load val dataset from disk

In [None]:
ds_val_grouped = datasets.load_from_disk("./data/initial_mlm_test_data/")

### Load Trainer

In [None]:
# Arguments for the Hugging Face Trainer. Checkpoints/logs go to `output_dir`;
# evaluation uses a per-device batch size of 8 and only the loss is computed
# (prediction_loss_only=True).
training_args = TrainingArguments(
        output_dir="./TwitchLeagueBert/evaluation_test",
        overwrite_output_dir=False,
        num_train_epochs=3,
        per_device_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=4,
        prediction_loss_only=True,
        evaluation_strategy="steps",
        eval_steps=5_000,
        report_to="all",
        per_device_eval_batch_size=8
    )

# No training dataset is supplied (train_dataset=None); the trainer is built with the
# MLM data collator and the grouped validation set as its default evaluation dataset.
trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=None,
        eval_dataset=ds_val_grouped
    )

In [None]:
ds_val_grouped

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
    num_rows: 478511
})

In [None]:
ds_val_grouped_sharded = ds_val_grouped.shard(100,index=0)
ds_val_grouped_sharded

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
    num_rows: 4786
})

### Load Pipeline for mask filling

In [None]:
mask_pipe = FillMaskPipeline(model, tokenizer)

### Tokenize validation set (only run if dataset is not loaded from disk)

In [None]:
def tokenize_function(examples):
  """Tokenize the ``"text"`` field of ``examples`` with the notebook-level ``tokenizer``.

  The special-tokens mask is requested as well; per the original author's note it
  is meant to be consumed by the data collator later on.
  """
  texts = examples["text"]
  encoded = tokenizer(texts, return_special_tokens_mask=True)
  return encoded

def group_texts(examples, tokenizer, block_size=128):
    """
    Concatenate every field of a tokenized batch and re-split it into fixed-size blocks.

    :param examples: mapping from field name (e.g. "input_ids") to a list of
        per-example lists, as handed over by a batched ``map`` call
    :param tokenizer: object exposing ``pad_token_id``; used to pad the final block
    :param block_size: number of items (tokens) in every returned block

    :return: dict with the same fields, each a list of blocks of exactly
        ``block_size`` items, plus a "labels" field that is a (shallow) copy of
        the "input_ids" list. If the concatenated length is not a multiple of
        ``block_size``, the leftover items form one extra, padded block.
    """
    # Local import keeps the function self-contained when it is shipped to worker processes.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable is linear in the total length,
    # whereas sum(list_of_lists, []) re-copies the accumulator for every example.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Number of trailing items that do not fill a complete block.
    remainder = total_length % block_size
    total_length = (total_length // block_size) * block_size

    # Split into complete blocks of block_size items.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Append the leftover items as one final block, padded up to block_size.
    if remainder > 0:
        pad_len = block_size - remainder
        for k, items in concatenated_examples.items():
            if k == "input_ids":
                pad_value = tokenizer.pad_token_id
            elif k == "special_tokens_mask":
                # Padding positions are special tokens: flag them with 1 so that a
                # collator honouring this mask does not pick them for masking.
                pad_value = 1
            else:
                # Zero value of the field's element type (0 for the integer masks).
                pad_value = type(items[0])()
            result[k].append(items[-remainder:] + [pad_value] * pad_len)

    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
ds_val_tok = ds_split["test"].map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

In [None]:
ds_val_grouped = ds_val_tok.map(
        group_texts,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
        batch_size=1000,
        num_proc=2
    )

In [None]:
# save dataset
ds_val_grouped.save_to_disk("./data/initial_mlm_test_data")

### Mask filling

In [None]:
def batch_fill_mask(batch):
  """Randomly mask a batch of strings, fill the masks and yield the results.

  Uses the notebook-level ``tokenizer``, ``data_collator`` and ``mask_pipe``.

  :param batch: list of raw text strings
  :return: generator of dicts with the keys "orig" (input string), "mask"
      (decoded string after masking) and "pred" (the masked string with every
      "<mask>" replaced by the highest-scoring candidate). Examples that end up
      without any "<mask>" are skipped.
  """
  # encode batch into input_ids and attention_mask with padding
  batch_encoded = tokenizer.batch_encode_plus(batch, truncation=True, padding=True, max_length=128)
  # transform to a list of per-example dicts that the data collator can read and mask
  batch_encoded_list = [dict(zip(batch_encoded, t)) for t in zip(*batch_encoded.values())]
  batch_masked = data_collator(batch_encoded_list)
  # decode again to show masked clear text and feed into mask filling pipeline
  batch_masked_strings = tokenizer.batch_decode(batch_masked["input_ids"])

  bos, eos, pad = tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token

  # Original and masked strings are paired *before* filtering, so skipping an
  # example can no longer shift the predictions onto the wrong original.
  for orig, masked in zip(batch, batch_masked_strings):
    if "<mask>" not in masked:
      # hacky solution which loses some examples
      # for the future: implement a custom masking strategy
      continue

    # Strip trailing padding and the outer bos/eos markers by their actual
    # length (a fixed [3:-3] slice leaves fragments of "</s>" / "<pad>" behind).
    text = masked
    while pad and text.endswith(pad):
      text = text[:-len(pad)]
    if bos and text.startswith(bos):
      text = text[len(bos):]
    if eos and text.endswith(eos):
      text = text[:-len(eos)]

    results = mask_pipe(text)
    # One mask yields a flat list of candidate dicts, several masks yield one
    # candidate list per mask; normalise to the nested form.
    if results and isinstance(results[0], dict):
      results = [results]

    unmasked = masked
    for candidates in results:
      best = max(candidates, key=lambda x: x["score"])
      # fill the masks left to right, one per candidate list
      unmasked = unmasked.replace("<mask>", best["token_str"], 1)

    yield {"orig": orig,
           "mask": masked,
           "pred": unmasked
           }

## Some example predictions

### Imaginary examples



In [None]:
mask_pipe(["LUL this is <mask>."])

[{'score': 0.10012084990739822,
  'token': 13609,
  'token_str': ' hilarious',
  'sequence': 'LUL this is hilarious.'},
 {'score': 0.07402978837490082,
  'token': 3984,
  'token_str': ' sad',
  'sequence': 'LUL this is sad.'},
 {'score': 0.03149763494729996,
  'token': 10536,
  'token_str': ' awful',
  'sequence': 'LUL this is awful.'},
 {'score': 0.03066454455256462,
  'token': 6456,
  'token_str': ' amazing',
  'sequence': 'LUL this is amazing.'},
 {'score': 0.025679733604192734,
  'token': 4082,
  'token_str': ' boring',
  'sequence': 'LUL this is boring.'}]

In [None]:
mask_pipe(["ResidentSleeper this is <mask>."])

[{'score': 0.35164201259613037,
  'token': 4082,
  'token_str': ' boring',
  'sequence': 'ResidentSleeper this is boring.'},
 {'score': 0.039362192153930664,
  'token': 12835,
  'token_str': ' exciting',
  'sequence': 'ResidentSleeper this is exciting.'},
 {'score': 0.03160205855965614,
  'token': 545,
  'token_str': ' it',
  'sequence': 'ResidentSleeper this is it.'},
 {'score': 0.02950814738869667,
  'token': 23691,
  'token_str': ' intense',
  'sequence': 'ResidentSleeper this is intense.'},
 {'score': 0.024716919288039207,
  'token': 496,
  'token_str': ' so',
  'sequence': 'ResidentSleeper this is so.'}]

In [None]:
mask_pipe(["WutFace this is <mask>."])

[{'score': 0.08646775037050247,
  'token': 10536,
  'token_str': ' awful',
  'sequence': 'WutFace this is awful.'},
 {'score': 0.04463113844394684,
  'token': 11661,
  'token_str': ' horrible',
  'sequence': 'WutFace this is horrible.'},
 {'score': 0.042015694081783295,
  'token': 6456,
  'token_str': ' amazing',
  'sequence': 'WutFace this is amazing.'},
 {'score': 0.039780836552381516,
  'token': 6982,
  'token_str': ' terrible',
  'sequence': 'WutFace this is terrible.'},
 {'score': 0.0376325361430645,
  'token': 10741,
  'token_str': ' cancer',
  'sequence': 'WutFace this is cancer.'}]

In [None]:
mask_pipe(["NotLikeThis this is <mask>."])

[{'score': 0.12479253113269806,
  'token': 3984,
  'token_str': ' sad',
  'sequence': 'NotLikeThis this is sad.'},
 {'score': 0.0938066691160202,
  'token': 11244,
  'token_str': ' embarrassing',
  'sequence': 'NotLikeThis this is embarrassing.'},
 {'score': 0.06633631885051727,
  'token': 28564,
  'token_str': ' painful',
  'sequence': 'NotLikeThis this is painful.'},
 {'score': 0.06352236866950989,
  'token': 10536,
  'token_str': ' awful',
  'sequence': 'NotLikeThis this is awful.'},
 {'score': 0.03548753634095192,
  'token': 21866,
  'token_str': ' ridiculous',
  'sequence': 'NotLikeThis this is ridiculous.'}]

In [None]:
mask_pipe(["THE BEST IS <mask> AND THE OTHERS"])

[{'score': 0.1785276085138321,
  'token': 1099,
  'token_str': ' FAKER',
  'sequence': 'THE BEST IS FAKER AND THE OTHERS'},
 {'score': 0.09717584401369095,
  'token': 3382,
  'token_str': ' UZI',
  'sequence': 'THE BEST IS UZI AND THE OTHERS'},
 {'score': 0.03640839457511902,
  'token': 889,
  'token_str': ' SKT',
  'sequence': 'THE BEST IS SKT AND THE OTHERS'},
 {'score': 0.0194076094776392,
  'token': 20269,
  'token_str': ' OTHERS',
  'sequence': 'THE BEST IS OTHERS AND THE OTHERS'},
 {'score': 0.019195228815078735,
  'token': 1200,
  'token_str': ' LCK',
  'sequence': 'THE BEST IS LCK AND THE OTHERS'}]

In [None]:
mask_pipe(["NA <mask> EU"], )

[{'score': 0.7024577260017395,
  'token': 563,
  'token_str': ' >',
  'sequence': 'NA > EU'},
 {'score': 0.035365454852581024,
  'token': 379,
  'token_str': ' <',
  'sequence': 'NA < EU'},
 {'score': 0.03160358965396881,
  'token': 732,
  'token_str': ' =',
  'sequence': 'NA = EU'},
 {'score': 0.027868052944540977,
  'token': 1089,
  'token_str': ' VS',
  'sequence': 'NA VS EU'},
 {'score': 0.02170223370194435,
  'token': 3552,
  'token_str': ' >>>',
  'sequence': 'NA >>> EU'}]

In [None]:
mask_pipe(["LUL mods are trying to ban us <mask>"])

[{'score': 0.6488573551177979,
  'token': 216,
  'token_str': ' LUL',
  'sequence': 'LUL mods are trying to ban us LUL'},
 {'score': 0.14632722735404968,
  'token': 997,
  'token_str': ' all',
  'sequence': 'LUL mods are trying to ban us all'},
 {'score': 0.018216745927929878,
  'token': 1006,
  'token_str': ' now',
  'sequence': 'LUL mods are trying to ban us now'},
 {'score': 0.015547475777566433,
  'token': 2285,
  'token_str': ' mods',
  'sequence': 'LUL mods are trying to ban us mods'},
 {'score': 0.013928733766078949,
  'token': 1085,
  'token_str': ' again',
  'sequence': 'LUL mods are trying to ban us again'}]

### Examples from training dataset

In [None]:
# Number of messages taken on each side of a sampled index.
sample_range = 10


batch = list()
batch_inds = list()
for i in range(10):  # draw 10 windows, each with its own fixed seed
  random.seed(i*22)
  sample_ind = random.randint(0, dataset["train"].num_rows)
  # Clamp the window start at 0: a negative start would be read as an index
  # relative to the end of the dataset instead of "the first rows".
  window_start = max(sample_ind - sample_range, 0)
  sample_msgs = dataset["train"][window_start:sample_ind+sample_range]["text"]

  # Join the messages with "</eos><bos>" so each message keeps its own boundaries.
  batch.append(f"{tokenizer.eos_token}{tokenizer.bos_token}".join(sample_msgs))
  batch_inds.append(sample_ind)

In [None]:
batch_encoded = tokenizer.batch_encode_plus(batch, truncation=True, padding=True, max_length=128)

In [None]:
batch_encoded

In [None]:
batch_encoded_list = [dict(zip(batch_encoded, t)) for t in zip(*batch_encoded.values())]

In [None]:
batch_masked = data_collator(batch_encoded_list)

In [None]:
batch_masked

{'input_ids': tensor([[    0,  1056,     2,  ...,  1946,  2409,     2],
        [    0,   750,     2,  ...,     1,     1,     1],
        [    0,     4,   462,  ...,  2569,   758,     2],
        ...,
        [    0, 35702,  2024,  ...,     1,     1,     1],
        [    0,   467,     2,  ...,     1,     1,     1],
        [    0,  2056,   368,  ...,  4356,     2,     2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, 3461, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, 2024,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])}

In [None]:
batch_masked_strings = tokenizer.batch_decode(batch_masked["input_ids"])

In [None]:
fill_mask_results = list()
for example in batch_masked_strings:
  # Remove trailing padding and the outer bos/eos markers by their real length;
  # a fixed [3:-3] slice leaves fragments of "</s>" / "<pad>" in the input.
  stripped = example
  while tokenizer.pad_token and stripped.endswith(tokenizer.pad_token):
    stripped = stripped[:-len(tokenizer.pad_token)]
  if stripped.startswith(tokenizer.bos_token):
    stripped = stripped[len(tokenizer.bos_token):]
  if stripped.endswith(tokenizer.eos_token):
    stripped = stripped[:-len(tokenizer.eos_token)]
  # Keep one entry per example so later zips stay aligned; an example without
  # any mask gets an empty result instead of being sent to the pipeline.
  fill_mask_results.append(mask_pipe(stripped) if "<mask>" in stripped else [])

In [None]:
best_examples = list()
for example in fill_mask_results:
  # A string with a single mask yields a flat list of candidate dicts, several
  # masks yield one candidate list per mask. Normalise to the nested form so the
  # loop below does not call sorted() on a dict.
  if example and isinstance(example[0], dict):
    example = [example]

  best_score_res = list()
  for mask in example:
    # NOTE(review): [-2] selects the candidate with the second-highest score, not the
    # top one (batch_fill_mask uses [-1]) - confirm this is intended.
    best_score_res.append(sorted(mask, key=lambda x: x["score"])[-2])

  best_examples.append(best_score_res)

In [None]:
# Show each masked string, its original and the string with the masks filled in.
for orig, masked, predictions in zip(batch, batch_masked_strings, best_examples):
  # Strings are immutable, so plain assignment is an adequate starting point.
  unmasked = masked
  for res in predictions:
    # fill the masks left to right, one per prediction
    unmasked = unmasked.replace("<mask>", res["token_str"], 1)
  print(f"mask: {masked}\norig: <s>{orig}</s>\npred: {unmasked}\n")

mask: <s>OMEGALUL</s><s>TwitchUnity STRAIGHT PRIDE<mask> STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity '</s><s>LULW</s><s>mangoFalco WOMBO COMBO</s><s>SHOW HEALING AND SHIELD LUL</s><s>better<mask></s><s>Akali EleGiggle</s><s>24K DAMAGE DIFFERENCE MID LUL</s><s>axa<mask> LUL</s><s>they took 36 MINUTES to beat EF OMEGALUL</s><s>AZIR DMG monkaW AKALI DMG<mask></s><s>OMEGALUL</s><s>Hey mods,<mask> you really<mask> that<mask> mode will affect on<mask>. I will<mask> you know that I have multiple accounts in this<mask> right now. If<mask> don’t<mask><mask>, I will copy and<mask> this on my other accounts</s>
orig: <s>OMEGALUL</s><s>TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity '</s><s>LULW</s><s>mangoFalco WOMBO COMBO</s><s>SHOW HEALING AND SHIELD LUL</s><s>better mid</s><s>Akali EleGiggle</s><s>24K DAMAGE DIFFERENCE MID LUL</s><s>Smartz LUL</s><s>they took 36 MINUTES to beat EF OMEGA

In [None]:
best_examples

In [None]:
mask_pipe("why is this Malphite on meta again <mask>")

[{'score': 0.4972342848777771,
  'token': 35,
  'token_str': '?',
  'sequence': 'why is this Malphite on meta again?'},
 {'score': 0.0628303810954094,
  'token': 341,
  'token_str': ' ResidentSleeper',
  'sequence': 'why is this Malphite on meta again ResidentSleeper'},
 {'score': 0.04635553061962128,
  'token': 216,
  'token_str': ' LUL',
  'sequence': 'why is this Malphite on meta again LUL'},
 {'score': 0.04054735228419304,
  'token': 656,
  'token_str': ' NotLikeThis',
  'sequence': 'why is this Malphite on meta again NotLikeThis'},
 {'score': 0.038534972816705704,
  'token': 523,
  'token_str': '???',
  'sequence': 'why is this Malphite on meta again???'}]

In [None]:
mask_pipe("")

In [None]:
# it has not learned what copy pasta is, otherwise it could have predicted COMA
# 249134, KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A COMA ResidentSleeper
mask_pipe("KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A <mask> ResidentSleeper")

### Examples from unseen data

In [None]:
# glob returns matches in arbitrary order; sort them so the messages (and the
# chunks built from the first files) come out in the same order on every run.
val_split_raw = sorted(glob.glob("/content/gdrive/MyDrive/nalcs_raw_data_fu/Copy of nalcs_nalcs1_w[268]d[1]_*.json"))

In [None]:
def is_msg(msg):
  """Return True when the mapping ``msg`` has an ``"attributes"`` key."""
  has_attributes = "attributes" in msg.keys()
  return has_attributes

In [None]:
# Collect the chat message text of every entry that carries an "attributes" field.
msgs = list()
for day_raw in val_split_raw:
  # Read explicitly as UTF-8 instead of relying on the platform's default encoding.
  with open(day_raw, "r", encoding="utf-8") as in_file:
    messages = json.load(in_file)
    msgs.extend([msg["attributes"]["message"] for msg in messages if is_msg(msg)])

In [None]:
len(msgs)

193585

In [None]:
chunk_size = 10
batch_size = 25

# Consecutive messages are glued together with "</eos><bos>" between them.
separator = f"{tokenizer.eos_token}{tokenizer.bos_token}"

# Walk over the complete chunks only (a trailing partial chunk is never reached)
# and stop as soon as batch_size chunks have been collected.
chunks = list()
for start in range(0, int(len(msgs)/chunk_size) * chunk_size, chunk_size):
  chunks.append(separator.join(msgs[start:start + chunk_size]))
  if len(chunks) == batch_size:
    break

In [None]:
len(chunks)

25

In [None]:
batch_encoded_highlights = tokenizer.batch_encode_plus(chunks, truncation=True, padding=True, max_length=128)

In [None]:
batch_encoded_highlights

In [None]:
batch_encoded_highlights_list = [dict(zip(batch_encoded_highlights, t)) for t in zip(*batch_encoded_highlights.values())]

In [None]:
batch_masked_highlights = data_collator(batch_encoded_highlights_list)

In [None]:
batch_masked_highlights

{'input_ids': tensor([[    0,   626,     2,  ...,     1,     1,     1],
        [    0,  2145,     2,  ...,     1,     1,     1],
        [    0,    38,    29,  ...,     1,     1,     1],
        ...,
        [    0,    54, 24945,  ...,     1,     1,     1],
        [    0,    70,    29,  ...,     1,     1,     1],
        [    0, 20458, 10928,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        ...,
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100]])}

In [None]:
batch_masked_highlights_strings = tokenizer.batch_decode(batch_masked_highlights["input_ids"])

In [None]:
fill_mask_highlights_results = list()
for example in batch_masked_highlights_strings:
  # Remove trailing padding and the outer bos/eos markers by their real length;
  # a fixed [3:-3] slice leaves fragments of "</s>" / "<pad>" in the input.
  stripped = example
  while tokenizer.pad_token and stripped.endswith(tokenizer.pad_token):
    stripped = stripped[:-len(tokenizer.pad_token)]
  if stripped.startswith(tokenizer.bos_token):
    stripped = stripped[len(tokenizer.bos_token):]
  if stripped.endswith(tokenizer.eos_token):
    stripped = stripped[:-len(tokenizer.eos_token)]
  # Keep one entry per example so later zips stay aligned; an example without
  # any mask gets an empty result instead of being sent to the pipeline.
  fill_mask_highlights_results.append(mask_pipe(stripped) if "<mask>" in stripped else [])

In [None]:
fill_mask_highlights_results[0]

In [None]:
best_examples_highlights = list()
for example in fill_mask_highlights_results:
  # A string with a single mask yields a flat list of candidate dicts, several
  # masks yield one candidate list per mask. Check the shape explicitly instead
  # of catching TypeError, which could also hide unrelated errors.
  if example and isinstance(example[0], dict):
    example = [example]

  best_score_highlights_res = list()
  for mask in example:
    # NOTE(review): [-2] selects the candidate with the second-highest score, not the
    # top one (batch_fill_mask uses [-1]) - confirm this is intended.
    best_score_highlights_res.append(sorted(mask, key=lambda x: x["score"])[-2])

  best_examples_highlights.append(best_score_highlights_res)

In [None]:
# Show each masked chunk, its original and the chunk with the masks filled in.
for orig, masked, predictions in zip(chunks, batch_masked_highlights_strings, best_examples_highlights):
  # Strings are immutable, so plain assignment is an adequate starting point.
  unmasked = masked
  for res in predictions:
    # fill the masks left to right, one per prediction
    unmasked = unmasked.replace("<mask>", res["token_str"], 1)
  print(f"mask: {masked}\norig: <s>{orig}</s>\npred: {unmasked}\n")

mask: <s>WutFace</s><s>PogChamp</s><s>PogChamp</s><s>PogChamp</s><s>Lemon vs Aphro Keepo</s><s>ORIG Kreygasm</s><s>LUL</s><s>Here we go</s><s><mask></s><s></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
orig: <s>WutFace</s><s>PogChamp</s><s>PogChamp</s><s>PogChamp</s><s>Lemon vs Aphro Keepo</s><s>Aphromoo Kreygasm</s><s>LUL</s><s>Here we go</s><s>OMGScoots</s><s></s>
pred: <s>WutFace</s><s>PogChamp</s><s>PogChamp</s><s>PogChamp</s><s>Lemon vs Aphro Keepo</s><s>ORIG Kreygasm</s><s>LUL</s><s>Here we go</s><s>WutFace</s><s></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

## Perplexity

Perplexity may not be the right metric for evaluating MLMs, but it gives a hint of how well the model is doing.

It is simply calculated by raising e to the power of the eval loss.

$e^{validation\_loss}$

In [None]:
# Evaluate on the 1/100 shard explicitly: the trainer's default eval dataset is the
# full grouped validation set, whereas the recorded run below used 4786 examples.
eval_res = trainer.evaluate(eval_dataset=ds_val_grouped_sharded)

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4786
  Batch size = 8


In [None]:
eval_res

{'eval_loss': 2.8871724605560303,
 'eval_runtime': 62.0081,
 'eval_samples_per_second': 77.183,
 'eval_steps_per_second': 9.66}

In [None]:
np.exp(eval_res["eval_loss"])

17.942504669248084

In [None]:
# NOTE(review): hard-coded value, presumably an eval loss copied from another run -
# record where it comes from so the resulting perplexity can be reproduced.
np.exp(2.5619537830352783)

12.961115801790676

### Preprocess Fu et al. 2017 data

In [None]:
ds_fu_raw = load_dataset('text', data_files={'train': "data/fu_raw_msgs_combined_cleaned.txt"})

Using custom data configuration default-13dccea9f5f00974


Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-13dccea9f5f00974/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-13dccea9f5f00974/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
ds_fu_raw_tok = ds_fu_raw["train"].map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

   

#0:   0%|          | 0/981 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/981 [00:00<?, ?ba/s]

In [None]:
ds_fu_raw_grouped = ds_fu_raw_tok.map(
        group_texts,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
        batch_size=1000,
        num_proc=2
    )

   

#0:   0%|          | 0/981 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/981 [00:00<?, ?ba/s]

In [None]:
ds_fu_raw_grouped.save_to_disk("./data/fu_raw_TwitchLeagueBERT")