## Setup

### VM setup

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
%cd "/content/gdrive/MyDrive/Master Thesis/Language Model Training"

/content/gdrive/MyDrive/Master Thesis/Language Model Training


In [None]:
!pip install transformers==4.18

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


use MLM scoring from: https://github.com/awslabs/mlm-scoring
=> version incompatibility

### Imports

In [None]:
from pathlib import Path
import random
import torch

from copy import deepcopy

import transformers
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaTokenizerFast
from transformers import RobertaForMaskedLM
from transformers import FillMaskPipeline
from transformers import TrainingArguments, Trainer

import datasets
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer

import numpy as np

import glob
import json

## Loading components

### Load Data

In [None]:
dataset = load_dataset('text', data_files={'train': "data/twitch_lol_combined.txt"})



Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-83f6c26edc6ac600/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# arbitrary split of 10%, same as in training validation, but  seed this time (some examples may overlap)
ds_split = dataset["train"].train_test_split(test_size=0.1, seed=42069)

In [None]:
ds_split

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 80076419
    })
    test: Dataset({
        features: ['text'],
        num_rows: 8897380
    })
})

### Load Tokenizer



In [None]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

### Load Model

In [None]:
model = RobertaForMaskedLM.from_pretrained("roberta-base")

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

### Create DataCollator

In [None]:
data_collator = DataCollatorForLanguageModeling(
  tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Load val dataset from disk

In [None]:
ds_val_grouped = datasets.load_from_disk("./data/initial_mlm_test_data_roberta/")

In [None]:
ds_val_grouped_sharded = ds_val_grouped.shard(100,index=0)
ds_val_grouped_sharded

Dataset({
    features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'labels'],
    num_rows: 7205
})

### Load Pipeline for mask filling

In [None]:
mask_pipe = FillMaskPipeline(model, tokenizer)

### Tokenize validation set

In [None]:
def tokenize_function(examples):
  # using return_special_tokens_mask=True for optimized DataCollator later
  return tokenizer(examples["text"], return_special_tokens_mask=True)

def group_texts(examples, tokenizer, block_size=128):
    """
    :param tokenizer:
    :param examples: DatasetDict containing fields with iterables to group
    :param block_size: maximum size of each group in items (tokens)

    :return: Each entry of examples grouped to block_size (number of tokens)
    """
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # keep remainder to add on later
    remainder = total_length % block_size
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # add remaining tokens
    if remainder > 0:
        for k in concatenated_examples.keys():
            if k == "input_ids":
                result[k].append(concatenated_examples[k][-remainder:] + ([tokenizer.pad_token_id] * (block_size-remainder)))
            else:
                result[k].append(concatenated_examples[k][-remainder:] + ([type(concatenated_examples[k][0])()] * (block_size-remainder)))

    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
ds_val_tok = ds_split["test"].map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

    

#1:   0%|          | 0/4449 [00:00<?, ?ba/s]

#0:   0%|          | 0/4449 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


In [None]:
ds_val_grouped = ds_val_tok.map(
        group_texts,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
        batch_size=1000,
        num_proc=2
    )

   

#0:   0%|          | 0/4449 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/4449 [00:00<?, ?ba/s]

In [None]:
# save dataset
ds_val_grouped.save_to_disk("./data/initial_mlm_test_data_roberta")

### Mask filling

In [None]:
def batch_fill_mask(batch):
  # encode batch into input_ids and attention_mask with padding
  batch_encoded = tokenizer.batch_encode_plus(batch, truncation=True, padding=True, max_length=128)
  # transform to a list that the data collator can read and mask
  batch_encoded_list = [dict(zip(batch_encoded, t)) for t in zip(*batch_encoded.values())]
  batch_masked = data_collator(batch_encoded_list)
  # decode again to show masked clear text and feed into mask filling pipeline
  batch_masked_strings = tokenizer.batch_decode(batch_masked["input_ids"])

  best_examples = list()
  for example in batch_masked_strings:
    if not "<mask>" in example:
      # hacky solution which loses some examples
      # for future implement my own masking startegy
      continue
    example = mask_pipe(example[3:-3]) # for stripping <s> and </s>
    best_score_res = list()
    for mask in example:
      if type(mask) is not list:
        mask = [mask]
      best_score_res.append(sorted(mask, key=lambda x: x["score"])[-1])
      best_examples.append(best_score_res)
  
  for orig, masked, predictions in zip(batch, batch_masked_strings, best_examples):
    unmasked = deepcopy(masked)
    for res in predictions:
      unmasked = unmasked.replace("<mask>", res["token_str"], 1)
    yield {"orig": orig,
           "mask": masked,
           "pred": unmasked
           }

## Some example predictions

### Imaginary examples



In [None]:
mask_pipe(["LUL this is <mask>."])awesome, interesting, great, cool, bad

[{'score': 0.06589065492153168,
  'token': 6344,
  'token_str': ' awesome',
  'sequence': 'LUL this is awesome.'},
 {'score': 0.04585752636194229,
  'token': 2679,
  'token_str': ' interesting',
  'sequence': 'LUL this is interesting.'},
 {'score': 0.04143474996089935,
  'token': 372,
  'token_str': ' great',
  'sequence': 'LUL this is great.'},
 {'score': 0.03511602059006691,
  'token': 3035,
  'token_str': ' cool',
  'sequence': 'LUL this is cool.'},
 {'score': 0.02795984223484993,
  'token': 1099,
  'token_str': ' bad',
  'sequence': 'LUL this is bad.'}]

In [None]:
mask_pipe(["ResidentSleeper this is <mask>."])

[{'score': 0.02327459119260311,
  'token': 45,
  'token_str': ' not',
  'sequence': 'ResidentSleeper this is not.'},
 {'score': 0.020932184532284737,
  'token': 162,
  'token_str': ' me',
  'sequence': 'ResidentSleeper this is me.'},
 {'score': 0.01878264546394348,
  'token': 357,
  'token_str': ' better',
  'sequence': 'ResidentSleeper this is better.'},
 {'score': 0.018267618492245674,
  'token': 205,
  'token_str': ' good',
  'sequence': 'ResidentSleeper this is good.'},
 {'score': 0.01669934019446373,
  'token': 13,
  'token_str': ' for',
  'sequence': 'ResidentSleeper this is for.'}]

In [None]:
mask_pipe(["WutFace this is <mask>."])

[{'score': 0.09738326072692871,
  'token': 6344,
  'token_str': ' awesome',
  'sequence': 'WutFace this is awesome.'},
 {'score': 0.051141783595085144,
  'token': 372,
  'token_str': ' great',
  'sequence': 'WutFace this is great.'},
 {'score': 0.033073633909225464,
  'token': 162,
  'token_str': ' me',
  'sequence': 'WutFace this is me.'},
 {'score': 0.03268173709511757,
  'token': 2679,
  'token_str': ' interesting',
  'sequence': 'WutFace this is interesting.'},
 {'score': 0.031207386404275894,
  'token': 14598,
  'token_str': ' hilarious',
  'sequence': 'WutFace this is hilarious.'}]

In [None]:
mask_pipe(["NotLikeThis this is <mask>."])

[{'score': 0.23756493628025055,
  'token': 940,
  'token_str': ' private',
  'sequence': 'NotLikeThis this is private.'},
 {'score': 0.13200481235980988,
  'token': 14073,
  'token_str': ' experimental',
  'sequence': 'NotLikeThis this is experimental.'},
 {'score': 0.10492823272943497,
  'token': 18434,
  'token_str': ' JavaScript',
  'sequence': 'NotLikeThis this is JavaScript.'},
 {'score': 0.056252479553222656,
  'token': 1081,
  'token_str': ' personal',
  'sequence': 'NotLikeThis this is personal.'},
 {'score': 0.04139917716383934,
  'token': 10813,
  'token_str': ' interactive',
  'sequence': 'NotLikeThis this is interactive.'}]

In [None]:
mask_pipe(["THE BEST IS <mask> AND THE OTHERS"])

[{'score': 0.7594372034072876,
  'token': 3779,
  'token_str': ' IT',
  'sequence': 'THE BEST IS IT AND THE OTHERS'},
 {'score': 0.02422250807285309,
  'token': 23435,
  'token_str': ' BEST',
  'sequence': 'THE BEST IS BEST AND THE OTHERS'},
 {'score': 0.022127950564026833,
  'token': 382,
  'token_str': ' US',
  'sequence': 'THE BEST IS US AND THE OTHERS'},
 {'score': 0.017808886244893074,
  'token': 10652,
  'token_str': ' THIS',
  'sequence': 'THE BEST IS THIS AND THE OTHERS'},
 {'score': 0.016839774325489998,
  'token': 9443,
  'token_str': ' HERE',
  'sequence': 'THE BEST IS HERE AND THE OTHERS'}]

In [None]:
mask_pipe(["NA <mask> EU"], )

[{'score': 0.44674187898635864,
  'token': 846,
  'token_str': 'V',
  'sequence': 'NAV EU'},
 {'score': 0.11876197904348373,
  'token': 33520,
  'token_str': 'WAY',
  'sequence': 'NAWAY EU'},
 {'score': 0.06054440885782242,
  'token': 30876,
  'token_str': 'BLE',
  'sequence': 'NABLE EU'},
 {'score': 0.05484357103705406,
  'token': 387,
  'token_str': 'B',
  'sequence': 'NAB EU'},
 {'score': 0.032075632363557816,
  'token': 771,
  'token_str': 'W',
  'sequence': 'NAW EU'}]

In [None]:
mask_pipe(["LUL mods are trying to ban us <mask>"])

[{'score': 0.38371285796165466,
  'token': 4,
  'token_str': '.',
  'sequence': 'LUL mods are trying to ban us.'},
 {'score': 0.2154664397239685,
  'token': 328,
  'token_str': '!',
  'sequence': 'LUL mods are trying to ban us!'},
 {'score': 0.1222674772143364,
  'token': 35,
  'token_str': ':',
  'sequence': 'LUL mods are trying to ban us:'},
 {'score': 0.053981341421604156,
  'token': 122,
  'token_str': ' now',
  'sequence': 'LUL mods are trying to ban us now'},
 {'score': 0.022417498752474785,
  'token': 6000,
  'token_str': ' forever',
  'sequence': 'LUL mods are trying to ban us forever'}]

### Examples from training dataset

In [None]:
sample_range = 10


batch = list()
batch_inds = list()
for i in range(10):
  random.seed(i*22)
  sample_ind = random.randint(0, dataset["train"].num_rows)
  sample_msgs = dataset["train"][sample_ind-sample_range:sample_ind+sample_range]["text"]

  batch.append(f"{tokenizer.eos_token}{tokenizer.bos_token}".join(sample_msgs))
  batch_inds.append(sample_ind)

In [None]:
batch_encoded = tokenizer.batch_encode_plus(batch, truncation=True, padding=True, max_length=128)

In [None]:
batch_encoded

In [None]:
batch_encoded_list = [dict(zip(batch_encoded, t)) for t in zip(*batch_encoded.values())]

In [None]:
batch_masked = data_collator(batch_encoded_list)

In [None]:
batch_masked

In [None]:
batch_masked_strings = tokenizer.batch_decode(batch_masked["input_ids"])

In [None]:
fill_mask_results = list()
for example in batch_masked_strings:
  fill_mask_results.append(mask_pipe(example[3:-4])) # for stripping <s> and </s>

In [None]:
best_examples = list()
for example in fill_mask_results:
  best_score_res = list()
  for mask in example:
    best_score_res.append(sorted(mask, key=lambda x: x["score"])[-2])

  best_examples.append(best_score_res)

In [None]:
for orig, masked, predictions in zip(batch, batch_masked_strings, best_examples):
  unmasked = deepcopy(masked)
  for res in predictions:
    unmasked = unmasked.replace("<mask>", res["token_str"], 1)
  print("mask: " + masked)
  print("orig: <s>" + orig + "</s>")
  print("pred: " + unmasked)
  print()

mask: <s>OMEGALUL</s><s>TwitchUnity STRAIGHT PRIDE<mask>Unity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity '</s><s><mask>ULW</s><s>mango<mask>co WOMBO COMBO</s><s>SHOW HEALING<mask> SHIELD LUL</s><s><mask> mid</s><s>Akali EleGiggle</s><s>24K DAMAGE D<mask><mask><mask><mask> LUL</s><s>Sm<mask> LUL</s><s>they took 36 MINUTES<mask> beat<mask> OMEG<mask>UL</s><s>AZIR DMG monkaW AKALI DM Minor LUL</s>
orig: <s>OMEGALUL</s><s>TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity STRAIGHT PRIDE TwitchUnity '</s><s>LULW</s><s>mangoFalco WOMBO COMBO</s><s>SHOW HEALING AND SHIELD LUL</s><s>better mid</s><s>Akali EleGiggle</s><s>24K DAMAGE DIFFERENCE MID LUL</s><s>Smartz LUL</s><s>they took 36 MINUTES to beat EF OMEGALUL</s><s>AZIR DMG monkaW AKALI DMG LULW</s><s>OMEGALUL</s><s>Hey mods, did you really think that slow mode will affect on me. I will let you know that I have multiple accounts in this chat right now. If you d

In [None]:
best_examples

In [None]:
mask_pipe("why is this Malphite on meta again <mask>")

[{'score': 0.9025630950927734,
  'token': 116,
  'token_str': '?',
  'sequence': 'why is this Malphite on meta again?'},
 {'score': 0.03233589977025986,
  'token': 17487,
  'token_str': '?',
  'sequence': 'why is this Malphite on meta again?'},
 {'score': 0.01631227135658264,
  'token': 4,
  'token_str': '.',
  'sequence': 'why is this Malphite on meta again.'},
 {'score': 0.006503658834844828,
  'token': 38713,
  'token_str': '???',
  'sequence': 'why is this Malphite on meta again???'},
 {'score': 0.005908921826630831,
  'token': 28749,
  'token_str': '??',
  'sequence': 'why is this Malphite on meta again??'}]

In [None]:
mask_pipe("")

In [None]:
# it has not learned what copy pasta is, otherwise it could have predicted COMA
# 249134, KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A COMA ResidentSleeper
mask_pipe("KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A <mask> ResidentSleeper")

[{'score': 0.5533647537231445,
  'token': 15823,
  'token_str': ' COM',
  'sequence': 'KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A COM ResidentSleeper'},
 {'score': 0.20228534936904907,
  'token': 5267,
  'token_str': ' CA',
  'sequence': 'KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A CA ResidentSleeper'},
 {'score': 0.03358528017997742,
  'token': 6247,
  'token_str': ' CO',
  'sequence': 'KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A CO ResidentSleeper'},
 {'score': 0.015576242469251156,
  'token': 8193,
  'token_str': ' Commonwealth',
  'sequence': 'KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A Commonwealth ResidentSleeper'},
 {'score': 0.013505257666110992,
  'token': 17088,
  'token_str': ' Guatemala',
  'sequence': 'KKOMA PUTTING ME IN A COMA ResidentSleeper KKOMA PUTTING ME IN A Guatemala ResidentSleeper'}]

### Examples from unseen data

In [None]:
val_split_raw = glob.glob("/content/gdrive/MyDrive/nalcs_raw_data_fu/Copy of nalcs_nalcs1_w[268]d[1]_*.json")

In [None]:
def is_msg(msg):
  return "attributes" in msg.keys()

In [None]:
msgs = list()
for day_raw in val_split_raw:
  with open(day_raw, "r") as in_file:
    messages = json.load(in_file)
    msgs.extend([msg["attributes"]["message"] for msg in messages if is_msg(msg)])

In [None]:
len(msgs)

193585

In [None]:
chunk_size = 10
batch_size = 25

chunks = list()
for i in range(int(len(msgs)/chunk_size)):
  chunk = msgs[i*chunk_size:min((i+1)*chunk_size, len(msgs))]
  chunks.append(f"{tokenizer.eos_token}{tokenizer.bos_token}".join(chunk))
  if len(chunks) == batch_size:
    break

In [None]:
len(chunks)

25

In [None]:
batch_encoded_highlights = tokenizer.batch_encode_plus(chunks, truncation=True, padding=True, max_length=128)

In [None]:
batch_encoded_highlights

In [None]:
batch_encoded_highlights_list = [dict(zip(batch_encoded_highlights, t)) for t in zip(*batch_encoded_highlights.values())]

In [None]:
batch_masked_highlights = data_collator(batch_encoded_highlights_list)

In [None]:
batch_masked_highlights

In [None]:
batch_masked_highlights_strings = tokenizer.batch_decode(batch_masked_highlights["input_ids"])

In [None]:
fill_mask_highlights_results = list()
for example in batch_masked_highlights_strings:
  fill_mask_highlights_results.append(mask_pipe(example[3:-3])) # for stripping <s> and </s>

In [None]:
fill_mask_highlights_results[0]

In [None]:
best_examples_highlights = list()
for example in fill_mask_highlights_results:
  best_score_highlights_res = list()
  for mask in example:
    best_score_highlights_res.append(sorted(mask, key=lambda x: x["score"])[-2])

  best_examples_highlights.append(best_score_highlights_res)

In [None]:
for orig, masked, predictions in zip(chunks, batch_masked_highlights_strings, best_examples_highlights):
  unmasked = deepcopy(masked)
  for res in predictions:
    unmasked = unmasked.replace("<mask>", res["token_str"], 1)
  print("mask: " + masked)
  print("orig: <s>" + orig + "</s>")
  print("pred: " + unmasked)
  print()

mask: <s>Wut<mask></s><s><mask>ogChamp</s><s>PogChamp</s><s>Pog<mask></s><s>Lemon vs Aphro Keepo</s><s><mask>phrom<mask><mask>reygasm</s><s>LUL</s><s>Here we<mask></s><s>OMGScoots</s><s></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
orig: <s>WutFace</s><s>PogChamp</s><s>PogChamp</s><s>PogChamp</s><s>Lemon vs Aphro Keepo</s><s>Aphromoo Kreygasm</s><s>LUL</s><s>Here we go</s><s>OMGScoots</s><s></s>
pred: <s>Wutcher</s><s>PogChamp</s><s>PogChamp</s><s>Poglia</s><s>Lemon vs Aphro Keepo</s><s>Aphrom & threygasm</s><s>LUL</s><s>Here we were</s><s>OMGScoots</s><s></s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [33]:
"<s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head 4Head 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s>"

'<s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head 4Head 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s>'

In [31]:
mask_pipe("<s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head 4Head 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s>")

[{'score': 0.1275135576725006,
  'token': 8,
  'token_str': ' and',
  'sequence': 'yes and such musicLemon is readyHeyGuysAphromoo PogChamp4Head 4Head 4Head 4Head 4HeadAphro laughing at lemon lulThis is the kind of music you listen to white highLemonNation PogChampAPHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED'},
 {'score': 0.042079195380210876,
  'token': 18,
  'token_str': "'s",
  'sequence': "yes's such musicLemon is readyHeyGuysAphromoo PogChamp4Head 4Head 4Head 4Head 4HeadAphro laughing at lemon lulThis is the kind of music you listen to white highLemonNation PogChampAPHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED"},
 {'score': 0.032024286687374115,
  'token': 16,
  'token_str': ' is',
  'sequence': 'yes is such musicLemon is readyHeyGuysAphromoo PogChamp4Head 4Head 4Head 4Head 4HeadAphro laughing at lemon lulThis is the kind of music you listen to white highLemonNation PogChampAPHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED'},
 {'score': 0.0292093213647604,
  'token'

In [35]:
mask_pipe("<s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head <mask><mask> 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s>")

[[{'score': 0.6749449968338013,
   'token': 204,
   'token_str': ' 4',
   'sequence': '<s><s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head 4<mask> 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s></s>'},
  {'score': 0.07971546798944473,
   'token': 306,
   'token_str': '4',
   'sequence': '<s><s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><s>4Head 4Head 4Head4<mask> 4Head</s><s>Aphro laughing at lemon lul</s><s>This is the kind of music you listen to white high</s><s>LemonNation PogChamp</s><s>APHRO POINTING OUT HIS NEXT BOOTY CALL, SHOTS FIRED</s></s>'},
  {'score': 0.05696845054626465,
   'token': 2,
   'token_str': '</s>',
   'sequence': '<s><s>yes</s><s>Kreygasm such music</s><s>Lemon is ready</s><s>HeyGuys</s><s>Aphromoo PogChamp</s><

### Perplexity

it may not be the right metric for measuring MLMs, but will give a hint at how well it is doing

It is simply calculated by raising e to the power of the eval loss.

$e^{validation\_loss}$

In [None]:
training_args = TrainingArguments(
        output_dir="./RoBERTa/evaluation_test",
        overwrite_output_dir=False,
        num_train_epochs=3,
        per_device_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=4,
        prediction_loss_only=True,
        evaluation_strategy="steps",
        eval_steps=5_000,
        report_to="all",
        per_gpu_eval_batch_size=8
    )

trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=None,
        eval_dataset=ds_val_grouped
    )

In [None]:
eval_res = trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 720455
  Batch size = 8


KeyboardInterrupt: ignored

In [None]:
eval_res

In [None]:
# simple perplexity
np.exp(eval_res["eval_loss"])

In [29]:
np.exp(3.914557933807373)

50.12690720133711

### Preprocess Fu et al. 2017 data

In [None]:
ds_fu_raw = load_dataset('text', data_files={'train': "data/fu_raw_msgs_combined_cleaned.txt"})

In [None]:
ds_fu_raw_tok = ds_fu_raw["train"].map(tokenize_function, batched=True, num_proc=2, remove_columns=["text"])

In [None]:
ds_fu_raw_grouped = ds_fu_raw_tok.map(
        group_texts,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
        batch_size=1000,
        num_proc=2
    )

   

#0:   0%|          | 0/981 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/981 [00:00<?, ?ba/s]

In [None]:
ds_fu_raw_grouped.save_to_disk("./data/fu_raw_roberta")

In [None]:
ds_fu_raw_grouped = datasets.load_from_disk("./data/fu_raw_roberta")

### Perplexity on Fu et al

In [None]:
training_args = TrainingArguments(
        output_dir="./RoBERTa/evaluation_test",
        overwrite_output_dir=False,
        num_train_epochs=3,
        per_device_train_batch_size=64,
        save_steps=10_000,
        save_total_limit=4,
        prediction_loss_only=True,
        evaluation_strategy="steps",
        eval_steps=5_000,
        report_to="all",
        per_gpu_eval_batch_size=8
    )

trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=None,
        eval_dataset=ds_fu_raw_grouped
    )

PyTorch: setting up devices


In [None]:
eval_res = trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
***** Running Evaluation *****
  Num examples = 125360
  Batch size = 8


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


In [None]:
eval_res

{'eval_loss': 3.915489912033081,
 'eval_runtime': 1410.3449,
 'eval_samples_per_second': 88.886,
 'eval_steps_per_second': 11.111}

In [None]:
# simple perplexity
np.exp(eval_res["eval_loss"])

50.17364616383546