In [2]:
import os
import argparse

from tokenizers import (
    ByteLevelBPETokenizer,
    Tokenizer,
    models,
    trainers,
)

## importing the tokenizer and subword BPE trainer
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram, WordLevel, WordPiece
from tokenizers.trainers import (
    BpeTrainer,
    WordLevelTrainer,
    WordPieceTrainer,
    UnigramTrainer,
)

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

In [3]:
datasets = [
    "aochildes",
    "bnc_spoken",
    "cbt",
    "children_stories",
    "gutenberg",
    "open_subtitles",
    "qed",
    "simple_wikipedia",
    "switchboard",
    "wikipedia",
]

In [44]:
args = dict()
args[
    "data_root"
] = "/Users/lukas/Desktop/Projects/BabyLM/BabyLMChallenge/data/babylm_data"
args["train_dir"] = "babylm_100M"
args["dev_dir"] = "babylm_dev"
args["test_dir"] = "babylm_test"

In [45]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""

    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


args = dotdict(args)

In [48]:
# Define the directories for train, dev, and test
data_root = args.data_root
train_dir = os.path.join(data_root, args.train_dir)
dev_dir = os.path.join(data_root, args.dev_dir)
test_dir = os.path.join(data_root, args.test_dir)

# Accumulate file paths for all datasets and splits
files = []
for dataset in datasets:
    train_file = os.path.join(train_dir, f"{dataset}.train")
    dev_file = os.path.join(dev_dir, f"{dataset}.dev")
    test_file = os.path.join(test_dir, f"{dataset}.test")
    files.extend([train_file, dev_file, test_file])

In [49]:
# load all texts from files into a list

texts = []
for file in files:
    with open(file, "r") as f:
        texts.extend(f.readlines())

In [53]:
# batch the texts into chunks of 1000
# this is to avoid memory issues
# when training the tokenizer

batch_size = 1000
batches = [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]

In [54]:
# iterator over the texts
def text_iterator():
    for batch in batches:
        yield batch

In [55]:
# load old tokenizer
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")

In [56]:
old_tokenizer.vocab_size

30522

In [59]:
tokens = old_tokenizer.tokenize(texts[999000])
tokens

['she', "'", 's', 'alright', 'now', '?']

In [66]:
old_tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [65]:
tokenizer = old_tokenizer.train_new_from_iterator(
    text_iterator(),
    vocab_size=old_tokenizer.vocab_size,
    special_tokens_map=special_tokens_map,
)

AttributeError: 'BertTokenizerFast' object has no attribute 'get_special_tokens'

In [60]:
tokens = tokenizer.tokenize(texts[999000])
tokens

['she', "'", 's', 'alright', 'now', '?']

In [61]:
# save tokenizer
tokenizer.save_pretrained("./tokenizer-trained-babylm_100M")

('./tokenizer-trained-babylm_100M/tokenizer_config.json',
 './tokenizer-trained-babylm_100M/special_tokens_map.json',
 './tokenizer-trained-babylm_100M/vocab.txt',
 './tokenizer-trained-babylm_100M/added_tokens.json',
 './tokenizer-trained-babylm_100M/tokenizer.json')

In [67]:
loaded_tokenizer = AutoTokenizer.from_pretrained("./tokenizer-trained-babylm_100M")

In [68]:
loaded_tokenizer

BertTokenizerFast(name_or_path='./tokenizer-trained-babylm_100M', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)