In [None]:
!head -5 C_data/*.txt

In [None]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Collect every corpus text file; sorted so the file list is stable across runs.
paths = sorted(str(x) for x in Path(".").glob("C_data/*.txt"))
if not paths:
    # Fail loudly instead of silently training a tokenizer on no data.
    raise FileNotFoundError("No training files match C_data/*.txt")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training: vocab_size here is the same value later given to
# RobertaConfig, and the special tokens are the ones looked up by name
# further down in this notebook ("<s>", "</s>").
tokenizer.train(
    files=paths,
    vocab_size=19_000,
    min_frequency=10,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [None]:
# Show which corpus files the glob pattern picks up.
list(map(str, Path(".").glob("C_data/*.txt")))

In [None]:
# The target directory used to be created by a commented-out `!mkdir`; create
# it here (idempotently) so a fresh run does not depend on that manual step.
Path("LSCP_tokenizer").mkdir(parents=True, exist_ok=True)
tokenizer.save_model("LSCP_tokenizer")

In [None]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from the vocab/merges files in ./LSCP_tokenizer.
vocab_path = "./LSCP_tokenizer/vocab.json"
merges_path = "./LSCP_tokenizer/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [None]:
# Attach a post-processor built from the (token, id) pairs of "</s>" and "<s>",
# passed in that order.
eos_pair = ("</s>", tokenizer.token_to_id("</s>"))
bos_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(eos_pair, bos_pair)
# Truncate at 256 tokens so this tokenizer agrees with the 256-token limit used
# for RobertaTokenizerFast and the 258 position embeddings of the model config
# later in this notebook (the previous limit of 512 exceeded both).
tokenizer.enable_truncation(max_length=256)

In [None]:
# Encode a sample sentence and display the resulting encoding object.
sample_text = "علی جمعهها خط قرمزشه. سه ساله میخوایم یه جمعه بریم خونش وقت نمیده بهمون"
tokenizer.encode(sample_text)

In [None]:
# Encode the same sample sentence and display the token strings it was split into.
sample_text = "علی جمعهها خط قرمزشه. سه ساله میخوایم یه جمعه بریم خونش وقت نمیده بهمون"
sample_encoding = tokenizer.encode(sample_text)
sample_encoding.tokens

In [None]:
# Check that we have a GPU by running the nvidia-smi command in a shell.
!nvidia-smi

In [None]:
# Check that PyTorch sees the GPU: the cell output is the value returned by
# torch.cuda.is_available().
import torch
torch.cuda.is_available()

In [None]:
from transformers import RobertaConfig

# Architecture settings for the model built below. vocab_size is the same
# value the tokenizer was trained with earlier in this notebook.
config_kwargs = {
    "vocab_size": 19_000,
    "max_position_embeddings": 258,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**config_kwargs)

In [None]:
from transformers import RobertaTokenizerFast

# Load the trained tokenizer files through the transformers fast tokenizer.
# `model_max_length` replaces the deprecated `max_len` keyword; the limit
# itself (256 tokens) is unchanged.
tokenizer = RobertaTokenizerFast.from_pretrained("./LSCP_tokenizer", model_max_length=256)

In [None]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model from the configuration defined above (not from a
# saved checkpoint).
model = RobertaForMaskedLM(config)

In [None]:
# Report the model's parameter count.
model.num_parameters()
# NOTE(review): this note used to read "=> 84 million parameters". That figure
# likely does not match the config used here (vocab_size=19_000, 6 hidden
# layers); re-run the cell and record the value it actually prints.

In [None]:
%%time
from transformers import LineByLineTextDataset

def load_line_dataset(file_path, block_size=128):
    """Build a LineByLineTextDataset for one text file.

    Args:
        file_path: Path of the text file to load.
        block_size: Value forwarded as the dataset's ``block_size``.

    Returns:
        A LineByLineTextDataset created with the notebook's ``tokenizer``.
    """
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )


# Training shards are named after multiples of one million:
# small_file_1000000.txt ... small_file_21000000.txt.
train_datasets = [
    load_line_dataset(f"C_data/small_file_{shard * 1_000_000}.txt")
    for shard in range(1, 22)
]

# Keep the per-shard names that the training cell refers to.
(
    dataset_1, dataset_2, dataset_3, dataset_4, dataset_5, dataset_6, dataset_7,
    dataset_8, dataset_9, dataset_10, dataset_11, dataset_12, dataset_13, dataset_14,
    dataset_15, dataset_16, dataset_17, dataset_18, dataset_19, dataset_20, dataset_21,
) = train_datasets

dataset_eval = load_line_dataset("C_data/eval.txt")

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for masked language modelling (mlm=True) with a masking
# probability of 0.15.
collator_kwargs = {
    "tokenizer": tokenizer,
    "mlm": True,
    "mlm_probability": 0.15,
}
data_collator = DataCollatorForLanguageModeling(**collator_kwargs)

In [None]:
from torch.utils.data import ConcatDataset
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./LSCP_small",
    overwrite_output_dir=True,
    num_train_epochs=3,
    # `per_gpu_train_batch_size` is the deprecated spelling of this argument.
    per_device_train_batch_size=30,
    save_steps=10_000,
    save_total_limit=2,
)

# Combine the shards into one flat ConcatDataset. Chaining `+` wraps the
# previous result in a new ConcatDataset for every shard, so each item lookup
# has to walk through 20 nested levels.
train_dataset = ConcatDataset([
    dataset_1, dataset_2, dataset_3, dataset_4, dataset_5, dataset_6, dataset_7,
    dataset_8, dataset_9, dataset_10, dataset_11, dataset_12, dataset_13, dataset_14,
    dataset_15, dataset_16, dataset_17, dataset_18, dataset_19, dataset_20, dataset_21,
])

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dataset_eval,
    # NOTE(review): newer transformers releases may expect this flag on
    # TrainingArguments instead; confirm against the installed version.
    prediction_loss_only=True,
)

In [None]:
%%time
# Run training with the Trainer configured above; %%time reports how long the
# cell took.
trainer.train()

In [None]:
# Write the trained model to ./LSCP, the directory the fill-mask pipeline
# below loads it from.
trainer.save_model(output_dir="./LSCP")

In [None]:
from transformers import RobertaTokenizerFast, pipeline

# Pass an explicit tokenizer instance instead of the directory path. As far
# as this notebook shows, ./LSCP_tokenizer holds only the vocab/merges files,
# so the pipeline may have nothing to infer the tokenizer class from.
fill_mask = pipeline(
    "fill-mask",
    model="./LSCP/",
    tokenizer=RobertaTokenizerFast.from_pretrained("./LSCP_tokenizer", model_max_length=256),
)

In [None]:
# Probe the model with a masked sentence taken from the Persian sample text
# used for the tokenizer check above (presumably the corpus language). The
# previous prompt, "La suno <mask>.", was in a different language.
fill_mask("سه ساله میخوایم یه <mask> بریم خونش وقت نمیده بهمون")