## `requirements.txt`

In [1]:
# Show the pinned package versions this notebook was run with.
!cat requirements.txt

jupyterlab==4.0.2
tokenizers==0.7.0
torch==2.0.1
transformers==2.11.0


In [2]:
# Confirm that the installed versions match the pins in requirements.txt.
# The `\>` end-of-word anchor keeps the `torch` pattern from also matching
# longer package names that merely start with "torch".
!pip list | grep "^tokenizers\|^transformers\|^torch\>"

tokenizers               0.7.0
torch                    2.0.1
transformers             2.11.0


## Data Source

In [3]:
from pathlib import Path

# Folder that holds (or will receive) the raw training corpus, given relative
# to the notebook's working directory.
data_dir = Path("../../data/")
# Fail early and say where we looked, rather than failing later inside the
# download cell with a less obvious error.
assert data_dir.is_dir(), f"data directory not found: {data_dir.resolve()}"

In [4]:
# Remote location of the training corpus.
URL = "https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt"
# The local copy keeps the name of the URL's last path segment.
file_name = URL.rpartition("/")[-1]
file_name

'oscar.eo.txt'

No-progress-bar request

progress-bar request

In [5]:
file_path = data_dir/file_name

# Download the corpus only when it is not already on disk.
if not file_path.exists():
    import requests
    from tqdm.auto import tqdm

    block_size = 1024  # bytes requested per chunk of the streamed body
    # Stream into a sibling ".part" file and rename it only after a complete
    # transfer, so an interrupted run never leaves a truncated corpus that the
    # exists() check above would accept on the next run.
    partial_path = file_path.with_name(file_path.name + ".part")
    received_bytes = 0

    with requests.get(URL, stream=True, timeout=30) as response:
        # Without this, an HTTP error page would be saved as if it were the corpus.
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get("content-length", 0))
        # Content-Length counts bytes on the wire, so it is only comparable to
        # what iter_content() yields when the body is not transparently decoded.
        size_is_checkable = (
            total_size_in_bytes != 0 and "content-encoding" not in response.headers
        )
        with open(partial_path, "wb") as file, tqdm(
            total=total_size_in_bytes, unit="iB", unit_scale=True
        ) as progress_bar:
            for data in response.iter_content(block_size):
                file.write(data)
                received_bytes += len(data)
                progress_bar.update(len(data))

    if size_is_checkable and received_bytes != total_size_in_bytes:
        # Do not keep a file that is known to be incomplete.
        partial_path.unlink()
        raise RuntimeError(
            f"ERROR, something went wrong: received {received_bytes:,} of "
            f"{total_size_in_bytes:,} bytes from {URL}"
        )
    partial_path.replace(file_path)

# Last top-level expression of the cell, so the corpus path is displayed.
file_path

## Tokenizers

The next cell takes a while to run and shows no progress bar, even though the `train` method
accepts a `show_progress` argument that defaults to `True`.

**Rmk.** According to [this issue from GitHub](https://github.com/huggingface/tokenizers/issues/157),
it seems that the **progress bar works fine** when running as a Python script **in terminal**, but **fails in notebook**.

In [6]:
# Folder for the trained tokenizer files (vocab.json / merges.txt). The same
# folder is reused further down as the Trainer's output_dir and as the model
# path given to the fill-mask pipeline.
tokenizer_dir = Path("EsperBERTo")
# exist_ok=True makes re-running this cell harmless once the folder exists.
tokenizer_dir.mkdir(exist_ok=True)

In [7]:
from tokenizers import ByteLevelBPETokenizer

# Train only when the two files that the reload cell reads are missing.
# Testing for an empty folder is not enough: this folder is also used as the
# Trainer's output_dir, so it can contain checkpoints (or any stray file)
# while vocab.json / merges.txt are absent.
no_trained_tokenizer = not all(
    (tokenizer_dir/name).is_file() for name in ("vocab.json", "merges.txt")
)
if no_trained_tokenizer:
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=[str(file_path)],
        vocab_size=52_000,
        min_frequency=2,
        # These are the special tokens later looked up by the post-processor
        # (<s>, </s>) and used by the fill-mask examples (<mask>).
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )
    # With tokenizers 0.7.0 this writes vocab.json and merges.txt into the folder.
    tokenizer.save(str(tokenizer_dir))

### Reload tokenizer and add `post_processor`

In [8]:
from tokenizers.implementations import ByteLevelBPETokenizer

# Rebuild the tokenizer from the files on disk, so this cell works whether or
# not the training cell actually ran in this session.
vocab_path, merges_path = (
    str(tokenizer_dir/name) for name in ("vocab.json", "merges.txt")
)
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [9]:
# Sanity check: tokenize a sample sentence before any post-processor is
# attached, so the output contains no <s> / </s> markers yet.
encoding = tokenizer.encode("Mi estas Julien.")
encoding.tokens

['Mi', 'Ġestas', 'ĠJuli', 'en', '.']

In [10]:
from tokenizers.processors import BertProcessing

# Wrap every encoding as "<s> ... </s>". BertProcessing takes the closing
# (token, id) pair first and the opening pair second; the ids are looked up in
# the trained vocabulary by token string.
closing_pair = ("</s>", tokenizer.token_to_id("</s>"))
opening_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(closing_pair, opening_pair)
# Cap encodings at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [11]:
# Same sentence again: with the post-processor attached, the tokens are now
# wrapped in <s> ... </s>.
encoding = tokenizer.encode("Mi estas Julien.")
encoding.tokens

['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

In [12]:
# The raw `tokenizers` object has no `is_fast` attribute; compare with the
# transformers tokenizer loaded in the next section.
hasattr(tokenizer, "is_fast")

False

### Fast tokenizer

**(?)** But won't this make our previously defined `post_processor` disappear?  

In [13]:
from transformers import RobertaTokenizerFast

# Load the saved vocab/merges through transformers. The folder comes from
# `tokenizer_dir` instead of a second hard-coded copy of its name, so the two
# cannot drift apart.
# NOTE: this rebinds `tokenizer` from the raw `tokenizers` object to a
# transformers tokenizer; every later cell uses this one.
tokenizer = RobertaTokenizerFast.from_pretrained(str(tokenizer_dir), max_len=512)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# The transformers tokenizer does expose `is_fast`, unlike the raw object probed earlier.
hasattr(tokenizer, "is_fast")

True

In [15]:
# Check the flag's value, not just that the attribute exists.
tokenizer.is_fast

True

## Dataset

In [16]:
from transformers import LineByLineTextDataset

# Build the training set from the corpus file (line by line, per the class name).
# The path is passed as `str`, as it is when the same file is handed to the
# tokenizer trainer earlier in this notebook.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=str(file_path),
    # Presumably the maximum number of tokens kept per line - TODO confirm
    # against the transformers 2.11.0 source. 64 is used here; 128 was the
    # value tried before.
    block_size=64,
)

In [17]:
# Display the dataset object; its default repr only shows the type and address.
dataset

<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x7f776483fd90>

**(?)** `block_size` is the context length?  

### Data Collator

In [18]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modelling objective (mlm=True), configured
# with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

## Train

In [19]:
from transformers import RobertaConfig

# Architecture hyper-parameters, collected in one place before building the config.
roberta_hparams = dict(
    # Same vocabulary size the tokenizer was trained with.
    vocab_size=52_000,
    # Two more than the 512-token limit used for the tokenizer; presumably
    # RoBERTa's position-id offset - TODO confirm.
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_hparams)

In [20]:
from transformers import RobertaForMaskedLM

# Built from the config alone (no `from_pretrained`), so no pretrained weights
# are loaded here.
model = RobertaForMaskedLM(config=config)

### Package problem
`Python==3.8.16` with `transformers==2.11.0` cannot import `RobertaForMaskedLM`
```bash
ImportError: cannot import name 'RobertaForMaskedLM' from 'transformers' (/home/phunc20/.config/miniconda3/envs/lm_from_scratch/lib/python3.8/site-packages/transformers/__init__.py)
```

**Solution.** `transformers` only exposes `RobertaForMaskedLM` when **`torch` is installed**, so the import fails
without it. `torch` is already pinned in `requirements.txt` to avoid this.

In [21]:
import torch

# Release GPU memory cached by PyTorch's allocator before training starts
# (useful when re-running cells in a long-lived kernel).
torch.cuda.empty_cache()

In [22]:
from transformers import Trainer, TrainingArguments

# Short run: training is bounded by max_steps rather than by a number of
# epochs, with one example per device per step. fp16 and
# prediction_loss_only are left at their defaults.
training_args = TrainingArguments(
    # Checkpoints are written into the same folder as the tokenizer files.
    output_dir=str(tokenizer_dir),
    overwrite_output_dir=True,
    max_steps=1_000,
    per_device_train_batch_size=1,
    # Log and checkpoint every 100 steps, with save_total_limit set to 3.
    logging_steps=100,
    save_steps=100,
    save_total_limit=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [23]:
# Start training; the loss is logged every `logging_steps` steps.
# NOTE(review): in the recorded run the logged loss is NaN from step 500 on.
# Possibly a batch with no masked tokens at batch size 1 poisoning the running
# sum - TODO confirm before trusting any weights saved from this run.
trainer.train()

Epoch:   0%|                                                                                                    | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                                                                           | 0/974545 [00:00<?, ?it/s][A
Iteration:   0%|                                                                               | 1/974545 [00:00<110:30:16,  2.45it/s][A
Iteration:   0%|                                                                               | 2/974545 [00:01<159:19:38,  1.70it/s][A
Iteration:   0%|                                                                               | 3/974545 [00:01<165:53:42,  1.63it/s][A
Iteration:   0%|                                                                               | 4/974545 [00:02<169:14:06,  1.60it/s][A
Iteration:   0%|                                                                               | 5/974545 [00:03<169:04:11,  1.60it/s][A
Iteration:   0%|                     

{"loss": 9.480108938217164, "learning_rate": 4.5e-05, "epoch": 0.00010261198815857657, "step": 100}



Iteration:   0%|                                                                             | 100/974545 [01:05<351:02:05,  1.30s/it][A
Iteration:   0%|                                                                             | 101/974545 [01:06<263:26:15,  1.03it/s][A
Iteration:   0%|                                                                             | 102/974545 [01:06<236:37:18,  1.14it/s][A
Iteration:   0%|                                                                             | 103/974545 [01:07<217:37:35,  1.24it/s][A
Iteration:   0%|                                                                             | 104/974545 [01:08<204:36:28,  1.32it/s][A
Iteration:   0%|                                                                             | 105/974545 [01:08<195:25:42,  1.39it/s][A
Iteration:   0%|                                                                             | 106/974545 [01:09<189:14:53,  1.43it/s][A
Iteration:   0%|                 

{"loss": 8.751515731811523, "learning_rate": 4e-05, "epoch": 0.00020522397631715314, "step": 200}



Iteration:   0%|                                                                             | 200/974545 [02:11<346:48:45,  1.28s/it][A
Iteration:   0%|                                                                             | 201/974545 [02:11<259:36:25,  1.04it/s][A
Iteration:   0%|                                                                             | 202/974545 [02:12<233:54:12,  1.16it/s][A
Iteration:   0%|                                                                             | 203/974545 [02:13<214:13:06,  1.26it/s][A
Iteration:   0%|                                                                             | 204/974545 [02:13<202:11:34,  1.34it/s][A
Iteration:   0%|                                                                             | 205/974545 [02:14<194:20:13,  1.39it/s][A
Iteration:   0%|                                                                             | 206/974545 [02:15<188:16:47,  1.44it/s][A
Iteration:   0%|                 

{"loss": 8.523752863407134, "learning_rate": 3.5e-05, "epoch": 0.0003078359644757297, "step": 300}



Iteration:   0%|                                                                             | 300/974545 [03:17<344:15:58,  1.27s/it][A
Iteration:   0%|                                                                             | 301/974545 [03:17<258:05:41,  1.05it/s][A
Iteration:   0%|                                                                             | 302/974545 [03:18<233:04:21,  1.16it/s][A
Iteration:   0%|                                                                             | 303/974545 [03:18<215:47:12,  1.25it/s][A
Iteration:   0%|                                                                             | 304/974545 [03:19<203:16:53,  1.33it/s][A
Iteration:   0%|                                                                             | 305/974545 [03:20<194:33:43,  1.39it/s][A
Iteration:   0%|                                                                             | 306/974545 [03:20<187:55:53,  1.44it/s][A
Iteration:   0%|                 

{"loss": 8.185723249912263, "learning_rate": 3e-05, "epoch": 0.0004104479526343063, "step": 400}



Iteration:   0%|                                                                             | 400/974545 [04:23<344:53:51,  1.27s/it][A
Iteration:   0%|                                                                             | 401/974545 [04:23<258:21:04,  1.05it/s][A
Iteration:   0%|                                                                             | 402/974545 [04:24<233:49:51,  1.16it/s][A
Iteration:   0%|                                                                             | 403/974545 [04:24<216:07:01,  1.25it/s][A
Iteration:   0%|                                                                             | 404/974545 [04:25<204:27:31,  1.32it/s][A
Iteration:   0%|                                                                             | 405/974545 [04:26<195:24:15,  1.38it/s][A
Iteration:   0%|                                                                             | 406/974545 [04:26<189:28:24,  1.43it/s][A
Iteration:   0%|                 

{"loss": NaN, "learning_rate": 2.5e-05, "epoch": 0.0005130599407928828, "step": 500}



Iteration:   0%|                                                                             | 500/974545 [05:29<349:34:58,  1.29s/it][A
Iteration:   0%|                                                                             | 501/974545 [05:29<262:05:07,  1.03it/s][A
Iteration:   0%|                                                                             | 502/974545 [05:30<235:32:52,  1.15it/s][A
Iteration:   0%|                                                                             | 503/974545 [05:30<217:09:01,  1.25it/s][A
Iteration:   0%|                                                                             | 504/974545 [05:31<204:30:47,  1.32it/s][A
Iteration:   0%|                                                                             | 505/974545 [05:32<194:00:54,  1.39it/s][A
Iteration:   0%|                                                                             | 506/974545 [05:32<188:15:56,  1.44it/s][A
Iteration:   0%|                 

{"loss": NaN, "learning_rate": 2e-05, "epoch": 0.0006156719289514594, "step": 600}



Iteration:   0%|                                                                             | 600/974545 [06:35<346:50:54,  1.28s/it][A
Iteration:   0%|                                                                             | 601/974545 [06:35<259:38:16,  1.04it/s][A
Iteration:   0%|                                                                             | 602/974545 [06:35<233:47:00,  1.16it/s][A
Iteration:   0%|                                                                             | 603/974545 [06:36<214:16:50,  1.26it/s][A
Iteration:   0%|                                                                             | 604/974545 [06:37<200:27:38,  1.35it/s][A
Iteration:   0%|                                                                             | 605/974545 [06:37<190:53:22,  1.42it/s][A
Iteration:   0%|                                                                             | 606/974545 [06:38<184:11:02,  1.47it/s][A
Iteration:   0%|                 

{"loss": NaN, "learning_rate": 1.5e-05, "epoch": 0.0007182839171100359, "step": 700}



Iteration:   0%|                                                                             | 700/974545 [07:40<346:46:58,  1.28s/it][A
Iteration:   0%|                                                                             | 701/974545 [07:41<259:45:15,  1.04it/s][A
Iteration:   0%|                                                                             | 702/974545 [07:41<234:44:39,  1.15it/s][A
Iteration:   0%|                                                                             | 703/974545 [07:42<216:27:12,  1.25it/s][A
Iteration:   0%|                                                                             | 704/974545 [07:42<203:43:25,  1.33it/s][A
Iteration:   0%|                                                                             | 705/974545 [07:43<193:06:10,  1.40it/s][A
Iteration:   0%|                                                                             | 706/974545 [07:44<187:28:04,  1.44it/s][A
Iteration:   0%|                 

KeyboardInterrupt: 

In [None]:
# Save the model into the tokenizer folder so the fill-mask pipeline can load
# model and tokenizer from one place. The path is passed as `str`, like every
# other place this folder is handed to transformers in this notebook.
model.save_pretrained(str(tokenizer_dir))

In [None]:
from transformers import pipeline

# Model and tokenizer are both loaded from the same folder.
model_dir = str(tokenizer_dir)
fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

In [None]:
# English gloss of the prompt: "The sun <mask>."
fill_mask("La suno <mask>.")

In [None]:
# English gloss of the prompt: "This is the beginning of a beautiful <mask>."
fill_mask("Jen la komenco de bela <mask>.")