In [1]:
%%bash
# Install pinned versions of the Hugging Face libraries used below, then list
# the installed versions of those three packages as a sanity check.
pip install -q datasets==2.10.1 transformers==4.26.1 tokenizers==0.12.1
pip list | grep --color "^dataset\|^transfor\|^tokeniz"



datasets                               2.10.1
tokenizers                             0.12.1
transformers                           4.26.1


In [2]:
import torch

In [45]:
# Runtime configuration shared by the rest of the notebook.
# (Set USE_CUDA = False by hand to force a CPU-only run.)
BATCH_SIZE = 64
USE_CUDA = torch.cuda.is_available()
# Mixed precision follows CUDA availability.
FP16 = USE_CUDA
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print(f'{USE_CUDA   = }')
print(f'{BATCH_SIZE = }')

USE_CUDA   = True
BATCH_SIZE = 64


### Tokenizer

Now let's load our tokenizer from the Hugging Face Hub.

In [4]:
from transformers import RobertaTokenizerFast

# Hub repository the tokenizer files are downloaded from.
tokenizer_repo = "phunc20/esperoberta-cased"
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_repo)
# Display the maximum sequence length the tokenizer is configured with.
tokenizer.model_max_length

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)okenizer_config.json:   0%|          | 0.00/311 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/478k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/279k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

512

**(?)** Is it normal to have such a big `tokenizer.model_max_length`?  


In [5]:
# Display the tokenizer's vocabulary size.
tokenizer.vocab_size

30000

## Load The Dataset

In [10]:
from pathlib import Path

# dataset_dir = Path("/kaggle/input/vina-cased-chunked-dataset")
# dataset_dir.exists()

In [11]:
from datasets import load_dataset

In [12]:
# from huggingface_hub import notebook_login

# notebook_login()

In [13]:
# Load the dataset from its Hub repository and display its splits and columns.
dataset_repo = "phunc20/oscar_esperoberta-cased_dataset"
dataset = load_dataset(dataset_repo)
dataset

  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 295376
    })
})

## The Model

In [14]:
from transformers import RobertaConfig

In [15]:
# Re-check the vocabulary size; the model config below is built from it.
tokenizer.vocab_size

30000

In [10]:
# Ids of the beginning-of-sequence and end-of-sequence special tokens.
tokenizer.bos_token_id, tokenizer.eos_token_id

(0, 2)

In [16]:
# Build the model configuration, overriding only the vocabulary size; every
# other hyper-parameter keeps the library default.
# NOTE(review): the commented-out values below look like a smaller
# configuration that is no longer applied — TODO confirm that the default
# configuration (and the resulting model size) is what is intended here.
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    #max_position_embeddings=514,
    #num_attention_heads=12,
    #num_hidden_layers=6,
    #type_vocab_size=1,
)

**(?)** `type_vocab_size`? What is it?  

In [17]:
from transformers import RobertaForMaskedLM

# Instantiate the masked-LM directly from the config (no pretrained weights are loaded).
model = RobertaForMaskedLM(config=config)
print(f'{model.num_parameters():,d}')  # ~109 million parameters with this config (see the output)

109,112,880


Let's test its ability to guess masked sentences **before training**.

In [18]:
from transformers import pipeline

# Probe sentence; the word hidden behind <mask> is "jarojn"
# ("En 1831, havante 22 jarojn").
masked_sentence = """\
En 1831, havante 22 <mask>, ŝatanto de skaraboj sen direkto en vivo,
Darvino vojaĝis ĉirkaŭ la Tero en la ŝipo HMS Beagle,
dum kvin jaroj.\
"""
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
result = fill_mask(masked_sentence)
result

[{'score': 0.00030039899866096675,
  'token': 28240,
  'token_str': ' Kart',
  'sequence': 'En 1831, havante 22 Kart, ŝatanto de skaraboj sen direkto en vivo, Darvino vojaĝis ĉirkaŭ la Tero en la ŝipo HMS Beagle, dum kvin jaroj.'},
 {'score': 0.0002480414404999465,
  'token': 10883,
  'token_str': ' 1947',
  'sequence': 'En 1831, havante 22 1947, ŝatanto de skaraboj sen direkto en vivo, Darvino vojaĝis ĉirkaŭ la Tero en la ŝipo HMS Beagle, dum kvin jaroj.'},
 {'score': 0.00023199191491585225,
  'token': 25307,
  'token_str': 'naskiĝinta',
  'sequence': 'En 1831, havante 22naskiĝinta, ŝatanto de skaraboj sen direkto en vivo, Darvino vojaĝis ĉirkaŭ la Tero en la ŝipo HMS Beagle, dum kvin jaroj.'},
 {'score': 0.0002099921548506245,
  'token': 15543,
  'token_str': 'ĥu',
  'sequence': 'En 1831, havante 22ĥu, ŝatanto de skaraboj sen direkto en vivo, Darvino vojaĝis ĉirkaŭ la Tero en la ŝipo HMS Beagle, dum kvin jaroj.'},
 {'score': 0.00020650227088481188,
  'token': 2599,
  'token_str': 'He

## Data Collator

In [19]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model batches; `mlm_probability` is the
# fraction of tokens it selects for the MLM objective.
MLM_PROBABILITY = 0.15
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=MLM_PROBABILITY)

> Data collator expects as input **a list of dictionaries**

In [20]:
n_samples = 2
# Take the first few training rows as plain dicts for the collator demo.
samples = [dataset["train"][i] for i in range(n_samples)]
for sample in samples:
    # Drop `word_ids` before collating. `pop` with a default (rather than `del`)
    # keeps this cell re-runnable after `word_ids` has been removed from
    # `dataset` further down, where `del` would raise KeyError.
    sample.pop("word_ids", None)

In [21]:
# Run the collator on the list of sample dicts and list the fields of the resulting batch.
collated_samples = data_collator(samples)
collated_samples.keys()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


dict_keys(['input_ids', 'attention_mask', 'labels'])

Note that
- In `input_ids`, roughly a fraction `mlm_probability` of the positions have been replaced by the mask id, `tokenizer.mask_token_id`.
- At the corresponding positions in `labels`, we can see the "correct" token id of the masked token. All the unmasked
  positions have `-100` as their label, i.e. they are to be ignored.

In [22]:
# For every collated sequence, rebuild the token ids as they were before
# masking and print the decoded text next to the masked version.
for chunk, label in zip(collated_samples["input_ids"],
                        collated_samples["labels"]):
    # Where the label is -100 the input id is kept as-is; everywhere else the
    # label carries the original token id, so take it from there.
    original_ids = torch.where(label == -100, chunk, label)
    gt = tokenizer.decode(
        original_ids,
        skip_special_tokens=True,
    )
    print(f'(label) {gt}')
    print()
    print(f"(chunk) '{tokenizer.decode(chunk)}'")

(label) Ĉu... preĝi | mediti | ricevi instigojn || kanti | muziki || informiĝi | legi | studi || prepari DiservonTemas pri kolekto de kristanaj kantoj, eldonita de Adolf Burkhardt inter 1974 kaj 1990 en dek kajeretoj. Ili estas reeldonitaj inter 1995 kaj 1998 de Bernhard Eichkorn en tri kajeroj, kies tria estas pliampleksigita per Dek Novaj Kantoj kaj suplemento, same de Adolf Burkhardt.En la dua kaj tria kajero oni adiciis 300 al la originaj kantonumeroj, por ke oni povu pli facile uzi la kajerojn kune kun la KELI-himnaro Adoru Kantante,

(chunk) '<s>Ĉu... preĝi | mediti<mask> ricevi instigojn || kanti<mask> muziki ||<mask> | legi | studi || prepari<mask></s><s>Temas pri kolekto de<mask><mask>, eldonita<mask> Adolf Burkhardt inter 1974 kaj 1990 en dek kajeretoj. Ili estas reeldonitaj inter 1995 kaj 1998 de Bernhard<mask>korn en tri kajeroj, kies<mask> estas pliampleksigita per Dek Novaj Kantoj kaj suplemento<mask> same de Adolf Burk<mask>.</s><s>En la dua<mask> tria kajero oni<mask>ci

**(?)** Why are there those `"�"` characters?

**Rmk.** Also, you might be interested in making sure only Latin-alphabet tokens are masked because masking characters from other languages, say, Chinese, wouldn't really help with our ultimate (Vietnamese) task.

In [23]:
# Inspect which string is used for each special-token role.
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

## Downsampled Dataset

In [25]:
# Drop the `word_ids` column (presumably not needed for training — TODO confirm).
# The guard keeps the cell safe to re-run: asking `remove_columns` for a column
# that is already gone raises an error.
if "word_ids" in dataset["train"].column_names:
    dataset = dataset.remove_columns(["word_ids"])

In [26]:
# Confirm which columns remain after dropping `word_ids`.
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 295376
    })
})

In [27]:
# Hold out 1% of the rows for evaluation; the fixed seed keeps the split
# reproducible. (The variable name is historical: with a fractional
# `test_size` the whole dataset is used rather than a fixed-size subsample.)
TEST_FRACTION = 0.01
SPLIT_SEED = 42
downsampled_dataset = dataset["train"].train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 292422
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2954
    })
})

## `TrainingArguments`

In [28]:
from transformers import TrainingArguments

In [29]:
TrainingArguments?

[0;31mInit signature:[0m
[0mTrainingArguments[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0moutput_dir[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moverwrite_output_dir[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_train[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_eval[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_predict[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mevaluation_strategy[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mtransformers[0m[0;34m.[0m[0mtrainer_utils[0m[0;34m.[0m[0mIntervalStrategy[0m[0;34m,[0m [0mstr[0m[0;34m][0m [0;34m=[0m [0;34m'no'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprediction_loss_only[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0

In [32]:
from pathlib import Path
# Root directory under which the training output directory is created.
# NOTE(review): "/kaggle/workin" looks like a typo for Kaggle's
# "/kaggle/working" directory — TODO confirm before relying on saved outputs.
data_dir = Path("/kaggle/workin")
#data_dir = Path("/tmp")

In [33]:
# Name the model after the last path component of the tokenizer repo id.
model_name = tokenizer_repo.rsplit("/", 1)[-1]
model_name

'esperoberta-cased'

In [46]:
# Training hyper-parameters. Logging, checkpointing and evaluation all share
# one cadence, so each checkpoint has a matching validation loss.
STEPS_BETWEEN_REPORTS = 200

training_args = TrainingArguments(
    # --- output location ---
    output_dir=data_dir/model_name,
    overwrite_output_dir=True,
    save_total_limit=4,
    # --- what to run ---
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    # --- hardware ---
    no_cuda=not USE_CUDA,
    fp16=FP16,  # follows CUDA availability (see the config cell)
    # --- optimisation ---
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=5.0,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # --- cadence, in optimisation steps ---
    logging_steps=STEPS_BETWEEN_REPORTS,
    save_steps=STEPS_BETWEEN_REPORTS,
    eval_steps=STEPS_BETWEEN_REPORTS,
    # --- data handling ---
    remove_unused_columns=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Trainer

In [47]:
from transformers import Trainer

# Wire the model, tokenizer, collator, data splits and arguments into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    args=training_args,
)

Using cuda_amp half precision backend


Perplexity before training

In [48]:
import math

# Perplexity is the exponential of the evaluation loss on the held-out split.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f">>> Perplexity: {perplexity:.2f}")

***** Running Evaluation *****
  Num examples = 2954
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


>>> Perplexity: 2956.07


**(?)** `16`? Number of steps, right?  
**(R)** Yes, approximately: $\frac{1000}{64} \approx 16$

In [43]:
# Number of full evaluation batches in the test split.
# NOTE(review): floor division ignores a final partial batch, so the real step
# count would be one higher unless the split size is a multiple of BATCH_SIZE
# (assuming the last partial batch is not dropped) — TODO confirm.
len(downsampled_dataset["test"]) // BATCH_SIZE

92

In [None]:
# Start training; progress, periodic evaluation and checkpoint messages stream into the cell output.
trainer.train()

***** Running training *****
  Num examples = 292422
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 22850
  Number of trainable parameters = 109112880


Step,Training Loss,Validation Loss
200,7.9893,7.873196
400,7.851,7.817106


***** Running Evaluation *****
  Num examples = 2954
  Batch size = 64
Saving model checkpoint to /kaggle/workin/esperoberta-cased/checkpoint-200
Configuration saved in /kaggle/workin/esperoberta-cased/checkpoint-200/config.json
Model weights saved in /kaggle/workin/esperoberta-cased/checkpoint-200/pytorch_model.bin
tokenizer config file saved in /kaggle/workin/esperoberta-cased/checkpoint-200/tokenizer_config.json
Special tokens file saved in /kaggle/workin/esperoberta-cased/checkpoint-200/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2954
  Batch size = 64
Saving model checkpoint to /kaggle/workin/esperoberta-cased/checkpoint-400
Configuration saved in /kaggle/workin/esperoberta-cased/checkpoint-400/config.json
Model weights saved in /kaggle/workin/esperoberta-cased/checkpoint-400/pytorch_model.bin
tokenizer config file saved in /kaggle/workin/esperoberta-cased/checkpoint-400/tokenizer_config.json
Special tokens file saved in /kaggle/workin/esperoberta-cased

In [43]:
# ANSI escape sequences keyed by a human-friendly colour name; "reset"
# switches the terminal back to its default rendition.
_SGR_CODES = {
    "red": 91,
    "green": 92,
    "yellow": 93,
    "blue": 94,
    "pink": 95,
    "teal": 96,
    "grey": 97,
    "reset": 0,
}
ansi_code = {colour: f"\x1b[{code}m" for colour, code in _SGR_CODES.items()}

def infer_viz2(sample, k=5):
    """Print the model's top-``k`` fill-in candidates for one masked sample.

    The sample is masked by ``data_collator``, run through ``model`` and
    printed ``k`` times: the first print fills every ``<mask>`` with that
    position's best candidate, the second with each position's runner-up,
    and so on. Filled-in tokens are highlighted in red.

    Previously only the candidates of the *first* mask were computed and the
    same token was pasted into every ``<mask>`` of the text.

    Args:
        sample: One dataset row accepted by ``data_collator``.
        k: Number of ranked candidates to show per masked position.
    """
    inputs = data_collator([sample])
    inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}
    input_ids = inputs["input_ids"][0]
    masked_text = tokenizer.decode(input_ids)
    mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[0]

    if mask_positions.numel() == 0:
        # The collator produced no <mask> for this sample: nothing to predict.
        print(f"'{masked_text}'")
        print()
        return

    # Predict in eval mode and without autograd, then restore the mode the
    # model was in so that an ongoing/later training run is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            token_logits = model(**inputs).logits
    finally:
        model.train(was_training)

    mask_token_logits = token_logits[0, mask_positions, :]
    # Never ask for more candidates than there are vocabulary entries.
    k = min(k, mask_token_logits.shape[-1])
    # One row per masked position, each holding its k best token ids.
    top_k_tokens = torch.topk(mask_token_logits, k, dim=1).indices.tolist()

    # Cut the text at the masks so each gap can be filled with its own token.
    pieces = masked_text.split(tokenizer.mask_token, len(top_k_tokens))
    for rank in range(k):
        filled = [pieces[0]]
        for candidates, tail in zip(top_k_tokens, pieces[1:]):
            token_text = tokenizer.decode([candidates[rank]])
            filled.append(f'{ansi_code["red"]}{token_text}{ansi_code["reset"]}')
            filled.append(tail)
        text = "".join(filled)
        print(f"'{text}'")
        print()

In [44]:
import random

# Fix the seed, then draw one row of the test split at random.
random.seed(42)
test_split = downsampled_dataset["test"]
i = random.randrange(len(test_split))
sample = test_split[i]
print(sample)

{'input_ids': [11, 83, 11, 78, 324, 10227, 74, 11, 83, 4195, 11, 220, 213, 416, 11, 2314, 11, 1091, 11, 83, 11, 78, 11, 82, 16, 408, 2307, 379, 11410, 75, 11, 220, 389, 11, 82, 313, 2119, 17958, 11, 83, 30, 16512, 1739, 11, 220, 5728, 11, 69, 11, 82, 5805, 11, 83, 11, 82, 18, 2, 0, 4158, 11, 82, 1746, 11, 1579, 11, 73, 16, 213, 2865, 11, 83, 324, 213, 6064, 11, 1091, 11, 233, 11, 83, 404, 11, 361, 11, 220, 213, 12900, 11, 83, 11, 82, 18, 337, 684, 11, 24305, 11, 83, 11, 78, 2759, 4699, 237, 1236, 25540, 324, 4135, 11, 83, 4195, 11, 220, 391, 213, 403, 11, 69, 11, 82, 18, 2, 0, 954, 3293, 626, 395, 313, 2119], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [45]:
# Show the model's fill-in candidates for the sampled test row.
infer_viz2(sample)

'[91m la[0mo'j kun[91m la[0m[91m la[0m[91m la[0mo rigard'is la ĉe[91m la[0mest'ant'[91m la[0m[91m la[0mj'n, sed baldaŭ ek'reg[91m la[0mis ili'[91m la[0m mal'trankvil'o: Ramzes hav'is[91m la[0m'a'n vizaĝ'o'n.</s><s>Tio'[91m la[0m dir'int[91m la[0m[91m la[0m, la[91m la[0m'o kun la[91m la[0m[91m la[0mant'ar'[91m la[0m[91m la[0m'las'is[91m la[0m salon'o'n. La ĉef'pastr'o'j Herhor[91m la[0m Mefres kun tim'o rigard[91m la[0mis[91m la[0m la ali'a'n.</s><s>— Kial do ni[91m la[0m[91m la[0m'



'[91m.[0mo'j kun[91m.[0m[91m.[0m[91m.[0mo rigard'is la ĉe[91m.[0mest'ant'[91m.[0m[91m.[0mj'n, sed baldaŭ ek'reg[91m.[0mis ili'[91m.[0m mal'trankvil'o: Ramzes hav'is[91m.[0m'a'n vizaĝ'o'n.</s><s>Tio'[91m.[0m dir'int[91m.[0m[91m.[0m, la[91m.[0m'o kun la[91m.[0m[91m.[0mant'ar'[91m.[0m[91m.[0m'las'is[91m.[0m salon'o'n. La ĉef'pastr'o'j Herhor[91m.[0m Mefres kun tim'o rigard[91m.[0mis[91m.[0m la ali'a'n.</s><s>— Kial do

In [46]:
# Draw one row of the training split at random (continues the RNG stream seeded earlier).
train_split = downsampled_dataset["train"]
i = random.randrange(len(train_split))
sample = train_split[i]
print(sample)

{'input_ids': [970, 1413, 16, 2183, 8679, 223, 1769, 5049, 255, 299, 7187, 2984, 2219, 914, 520, 2, 0, 394, 8679, 223, 1769, 5049, 255, 299, 11795, 30, 2408, 814, 1017, 223, 1769, 1204, 23604, 4111, 343, 213, 492, 1567, 330, 213, 5613, 13232, 1159, 443, 6017, 308, 17, 69, 4975, 28825, 256, 1943, 18, 337, 8679, 5183, 213, 20744, 8944, 377, 1680, 237, 213, 17943, 6575, 3169, 18, 788, 343, 14977, 213, 2873, 219, 223, 2765, 8331, 16, 11428, 1664, 2380, 538, 213, 12136, 223, 3142, 20844, 16088, 24278, 18, 337, 970, 2084, 2766, 23317, 757, 436, 213, 1771, 2741, 223, 308, 17, 69, 6861, 7645, 12920, 4445, 450, 18, 699, 28, 65, 3350, 213, 26450, 9970, 16, 14977, 213, 1715, 219, 223, 2592, 8786, 16, 5183, 213, 673], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [48]:
# Show the model's fill-in candidates for the sampled training row.
infer_viz2(sample)

'[91m la[0m uzas,[91m la[0m Traktato[91m la[0m[91m la[0m[91m la[0m[91m la[0m[91m la[0mmalambiguigo).</s><s>[91m la[0m Traktato de Versajlo[91m la[0m france: Traité de[91m la[0m[91m la[0m[91m la[0m ) estis[91m la[0m plej grava el la packontraktoj[91m la[0m[91m la[0m[91m la[0m[91m la[0ma Mondmiliton al fino. La[91m la[0m finis la militan staton inter Germanio kaj la Aliancitaj ŝtatoj.[91m la[0m estis subskribita la 28an de junio 1919, ekzakte kvin jarojn[91m la[0m la murdo de arkiduko Franz Ferdinand.[91m la[0m[91m la[0m Centraj Potencoj sur la germana flanko de 1-[91m la[0m Mondmilito subskribis apartajn traktatojn. [8] Kvankam la armistico[91m la[0m subskribita la 11an de novembro 1918, finis la fa'



'[91m.[0m uzas,[91m.[0m Traktato[91m.[0m[91m.[0m[91m.[0m[91m.[0m[91m.[0mmalambiguigo).</s><s>[91m.[0m Traktato de Versajlo[91m.[0m france: Traité de[91m.[0m[91m.[0m[91m.[0m ) estis[91m.[0m plej grava el la packontr