In [1]:
import os 
os.chdir('..')

%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
import math
from collections import defaultdict
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!ls tokenizers

babylm-10M-BPE	     babylm_10M_BPE-merges.txt	babylm_10M_sentencepiece.json
babylm_10M_BPE.json  babylm_10M_BPE-vocab.json	babylm_10M_wordpiece.json


In [3]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("../datasets/babylm_10M/").glob("*.train")]
paths

['../datasets/babylm_10M/bnc_spoken.train',
 '../datasets/babylm_10M/children_stories.train',
 '../datasets/babylm_10M/cbt.train',
 '../datasets/babylm_10M/switchboard.train',
 '../datasets/babylm_10M/wikipedia.train',
 '../datasets/babylm_10M/gutenberg.train',
 '../datasets/babylm_10M/aochildes.train',
 '../datasets/babylm_10M/qed.train',
 '../datasets/babylm_10M/simple_wikipedia.train',
 '../datasets/babylm_10M/open_subtitles.train']

In [4]:
%%time 

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 39.2 s, sys: 2.86 s, total: 42.1 s
Wall time: 3.58 s


In [5]:
# !mkdir bert_baseline
tokenizer.save_model("./bert_baseline")

['./bert_baseline/vocab.json', './bert_baseline/merges.txt']

In [6]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./bert_baseline/vocab.json",
    "./bert_baseline/merges.txt",
)

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.encode("hello world.").tokens

['<s>', 'hello', 'Ġworld', '.', '</s>']

In [9]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [10]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("./bert_baseline/", max_len=512)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [11]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [12]:
model.num_parameters()
# => 84 million parameters

83504416

In [13]:
!ls "../datasets/babylm_10M/"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
aochildes.train		gutenberg.train		switchboard.train
bnc_spoken.train	open_subtitles.train	wikipedia.train
cbt.train		qed.train
children_stories.train	simple_wikipedia.train


In [14]:
!cat ../datasets/babylm_10M/* > ../datasets/babylm_10M_merged.train 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
!head ../datasets/babylm_10M_merged.train 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
aboo .
kee .
hey .
smile ?
hm hm .
smile .
hi .
aguh .
mguh !
mguh ?


In [16]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../datasets/babylm_10M_merged.train",
    block_size=128,
)



CPU times: user 52.7 s, sys: 2.79 s, total: 55.5 s
Wall time: 15.3 s


In [18]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_baseline/",
    overwrite_output_dir=True,
    learning_rate=5e-5
    per_device_train_batch_size=64,
    num_train_epochs=10,
    save_steps=2000,
    save_total_limit=50,
    seed=12
    evaluate_during_training=True,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


In [19]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 1015503
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 15868
  Number of trainable parameters = 83504416
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,7.4632
1000,6.6813
1500,6.3908
2000,6.1416
2500,5.9727
3000,5.8265
3500,5.6557
4000,5.6179
4500,5.4795
5000,5.4203


Saving model checkpoint to ./bert_baseline/checkpoint-10000
Configuration saved in ./bert_baseline/checkpoint-10000/config.json
Model weights saved in ./bert_baseline/checkpoint-10000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 39min 36s, sys: 1.93 s, total: 39min 38s
Wall time: 39min 34s


TrainOutput(global_step=15868, training_loss=5.326497113761777, metrics={'train_runtime': 2374.2064, 'train_samples_per_second': 427.723, 'train_steps_per_second': 6.683, 'total_flos': 2.522244857812992e+16, 'train_loss': 5.326497113761777, 'epoch': 1.0})

In [20]:
trainer.save_model("./bert_baseline")

Saving model checkpoint to ./bert_baseline
Configuration saved in ./bert_baseline/config.json
Model weights saved in ./bert_baseline/pytorch_model.bin


In [21]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./bert_baseline",
    tokenizer="./bert_baseline"
#     tokenizer="./tokenizers/babylm-10M-BPE"
)

loading configuration file ./bert_baseline/config.json
Model config RobertaConfig {
  "_name_or_path": "./bert_baseline",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./bert_baseline/config.json
Model config RobertaConfig {
  "_name_or_path": "./bert_baseline",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classif

In [22]:
fill_mask("Happy <mask> year.")

[{'score': 0.07110521197319031,
  'token': 263,
  'token_str': ' a',
  'sequence': 'Happy a year.'},
 {'score': 0.06310326606035233,
  'token': 265,
  'token_str': ' the',
  'sequence': 'Happy the year.'},
 {'score': 0.06133388727903366,
  'token': 316,
  'token_str': "'s",
  'sequence': "Happy's year."},
 {'score': 0.028491629287600517,
  'token': 389,
  'token_str': ' this',
  'sequence': 'Happy this year.'},
 {'score': 0.022748861461877823,
  'token': 457,
  'token_str': ' one',
  'sequence': 'Happy one year.'}]

In [23]:
fill_mask("Good <mask>.")

[{'score': 0.03506023809313774,
  'token': 984,
  'token_str': ' night',
  'sequence': 'Good night.'},
 {'score': 0.027495840564370155,
  'token': 587,
  'token_str': ' time',
  'sequence': 'Good time.'},
 {'score': 0.023821823298931122,
  'token': 457,
  'token_str': ' one',
  'sequence': 'Good one.'},
 {'score': 0.02028697542846203,
  'token': 882,
  'token_str': ' life',
  'sequence': 'Good life.'},
 {'score': 0.019493918865919113,
  'token': 730,
  'token_str': ' day',
  'sequence': 'Good day.'}]