In [1]:
import nltk

files = [
    '../data/processed/oshhamaho.txt',
    '../data/processed/apkbr_ru.txt',
]

sentences = set()

for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    for sentence in nltk.sent_tokenize(text):
        sentences.add(sentence)

In [2]:
import pandas as pd
from datasets import Dataset

df = pd.DataFrame(sentences, columns=['text'])

dataset = Dataset.from_pandas(df)

In [3]:
from transformers import AutoTokenizer

tokenizer_base = AutoTokenizer.from_pretrained("bert-base-cased")

tokenizer = tokenizer_base.train_new_from_iterator(sentences, vocab_size=50000, min_frequency=2)






In [4]:
tokenizer.tokenize('и шэджагъуэ дыгъапIэр къохьэлъэкIыу хуабэ хъуми')

['и', 'шэджагъуэ', 'дыгъа', '##пIэр', 'къохьэлъэ', '##кIыу', 'хуабэ', 'хъуми']

In [5]:
tokenizer.save_pretrained('mlm_bert')

('mlm_bert/tokenizer_config.json',
 'mlm_bert/special_tokens_map.json',
 'mlm_bert/vocab.txt',
 'mlm_bert/added_tokens.json',
 'mlm_bert/tokenizer.json')

In [16]:
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    hidden_size=128,
    vocab_size=50000,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=256,
    max_position_embeddings=64
)

model = BertForMaskedLM(config=config)
print(model.num_parameters())  #12382864

7005392


In [17]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [18]:
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator


class LineByLineTextDataset(Dataset):
    def __init__(self, tokenizer, raw_datasets, max_length: int):
        self.padding = "max_length"
        self.text_column_name = 'text'
        self.max_length = max_length
        self.accelerator = Accelerator(gradient_accumulation_steps=1)
        self.tokenizer = tokenizer

        with self.accelerator.main_process_first():
            self.tokenized_datasets = raw_datasets.map(
                self.tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=[self.text_column_name],
                desc="Running tokenizer on dataset line_by_line",
            )
            self.tokenized_datasets.set_format('torch', columns=['input_ids'], dtype=torch.long)

    def tokenize_function(self, examples):
        examples[self.text_column_name] = [
            line for line in examples[self.text_column_name] if len(line[0]) > 0 and not line[0].isspace()
        ]
        return self.tokenizer(
            examples[self.text_column_name],
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

    def __len__(self):
        return len(self.tokenized_datasets)

    def __getitem__(self, i):
        return self.tokenized_datasets[i]

In [19]:
tokenized_dataset_train = LineByLineTextDataset(
    tokenizer=tokenizer,
    raw_datasets=dataset,
    max_length=64,
)

Running tokenizer on dataset line_by_line (num_proc=4):   0%|          | 0/615271 [00:00<?, ? examples/s]

In [20]:
dataset[34344]

{'text': 'Ауэ пэжыр пэжщ: псори зэхуэхьэсауэ, дэтхэнэми нэхъ гуакIуэу къыдекIуэкIыр къыхакъузыкIарэ, зэгъэуIуауэ зым и деж щызэхуэхьэсыжауэ, – апхуэдэ макъ зэи зыми зэхихатэкъым.'}

In [21]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./mlm_bert",
    overwrite_output_dir=True,
    # push_to_hub=True,
    # hub_model_id="Ransaka/sinhala-bert-yt-comments",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    save_steps=5_000,
    logging_steps=500,
    save_total_limit=5,
    use_mps_device=True,  # disable this if you're running non-mac env
    hub_private_repo=False,  # please set true if you want to save model privetly
    save_safetensors=True,
    learning_rate=1e-4,
    # report_to='wandb'
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train
)



In [22]:
trainer.train()

Step,Training Loss
500,0.3974


KeyboardInterrupt: 