In [1]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# paths = [str(x) for x in Path(".").glob("**/*.txt")]

In [2]:
# paths

In [3]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

In [4]:
# Customize training
tokenizer.train(files='oscar.eo.txt', vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

In [5]:
# Save files to disk
tokenizer.save_model(".", "esperberto")

['.\\esperberto-vocab.json', '.\\esperberto-merges.txt']

In [6]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

In [7]:
tokenizer = ByteLevelBPETokenizer(
    "esperberto-vocab.json",
    "esperberto-merges.txt",
)

In [8]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

In [9]:
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode("Mi estas Julien.")
)

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [10]:
# from torch.utils.data import Dataset

# class EsperantoDataset(Dataset):
#     def __init__(self, evaluate: bool = False):
#         tokenizer = ByteLevelBPETokenizer(
#             "esperberto-vocab.json",
#             "esperberto-merges.txt",
#         )
#         tokenizer._tokenizer.post_processor = BertProcessing(
#             ("</s>", tokenizer.token_to_id("</s>")),
#             ("<s>", tokenizer.token_to_id("<s>")),
#         )
#         tokenizer.enable_truncation(max_length=512)
#         # or use the RobertaTokenizer from `transformers` directly.

#         self.examples = []

#         src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path("./data/").glob("*-train.txt")
#         for src_file in src_files:
#             print("🔥", src_file)
#             lines = src_file.read_text(encoding="utf-8").splitlines()
#             self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

#     def __len__(self):
#         return len(self.examples)

#     def __getitem__(self, i):
#         # We’ll pad at the batch level.
#         return torch.tensor(self.examples[i])

In [11]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [12]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("julien-c/EsperBERTo-small", max_len=512)

In [16]:
# !wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2021-04-11 19:33:18--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 54.192.147.62, 54.192.147.68, 54.192.147.25, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|54.192.147.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312733741 (298M) [text/plain]
Saving to: 'oscar.eo.txt'

     0K .......... .......... .......... .......... ..........  0%  113K 45m6s
    50K .......... .......... .......... .......... ..........  0%  238K 33m14s
   100K .......... .......... .......... .......... ..........  0% 2.52M 22m48s
   150K .......... .......... .......... .......... ..........  0%  257K 22m3s
   200K .......... .......... .......... .......... ..........  0% 2.46M 18m3s
   250K .......... .......... .......... .......... ..........  0%  254K 18m23s
   300K .......... .......... .......... .......... ..........  0%  252K 18m38s
   350K ........

In [13]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [14]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="oscar.eo.txt",
    block_size=128,
)

Wall time: 5min 33s


In [15]:
model.num_parameters()
# => 84 million parameters

83504416

In [16]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [20]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


ValueError: You can only call `wandb.watch` once per model.  Pass a new instance of the model if you need to call wandb.watch again in your code.

In [46]:
help(trainer.train)

Help on method train in module transformers.trainer:

train(resume_from_checkpoint: Union[str, bool, NoneType] = None, trial: Union[ForwardRef('optuna.Trial'), Dict[str, Any]] = None, **kwargs) method of transformers.trainer.Trainer instance
    Main training entry point.
    
    Args:
        resume_from_checkpoint (:obj:`str` or :obj:`bool`, `optional`):
            If a :obj:`str`, local path to a saved checkpoint as saved by a previous instance of
            :class:`~transformers.Trainer`. If a :obj:`bool` and equals `True`, load the last checkpoint in
            `args.output_dir` as saved by a previous instance of :class:`~transformers.Trainer`. If present,
            training will resume from the model/optimizer/scheduler states loaded here.
        trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`):
            The trial run or the hyperparameter dictionary for hyperparameter search.
        kwargs:
            Additional keyword arguments used to hide deprecat

In [None]:
trainer.save_model("EsperBERTo")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="EsperBERTo",
    tokenizer="EsperBERTo"
)

In [None]:
# The sun <mask>.
# =>

fill_mask("La suno <mask>.")

In [None]:
fill_mask("Jen la komenco de bela <mask>.")

# This is the beginning of a beautiful <mask>.
# =>