In [11]:
# import libraries
import torch.optim as optim
import torch
import os
from transformers import AutoTokenizer, LineByLineTextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
# Root directory of the pretraining inputs; the tokenizer cell loads from its
# "btokenizer" subfolder. Keep the trailing slash: the value is used as a path prefix.
SM_CHANNEL_TRAIN = "pretrain_data/"

In [3]:
# Load the saved tokenizer from the "btokenizer" folder of the training channel.
tokenizer_dir = os.path.join(SM_CHANNEL_TRAIN, "btokenizer")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

In [4]:
# Build the line-by-line MLM dataset from the raw text corpus.
# The corpus path is derived from SM_CHANNEL_TRAIN (same root the tokenizer is
# loaded from) so that moving the data directory only requires one change.
# NOTE(review): LineByLineTextDataset is flagged as deprecated by recent
# transformers releases — confirm against the installed version and consider
# migrating the preprocessing when convenient.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=os.path.join(SM_CHANNEL_TRAIN, "raw_text.txt"),
    block_size=256,  # maximum sequence length (in tokens) per example
)



In [27]:
# Instantiate a randomly initialised BERT masked-LM whose vocabulary size
# matches the loaded tokenizer (all other BertConfig fields keep their defaults).
config = BertConfig(vocab_size=len(tokenizer))
model = BertForMaskedLM(config)

# Prefer the first GPU, but fall back to CPU so the cell does not crash on a
# machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"No of parameters: {model.num_parameters()}")

No of parameters: 124492880


In [28]:
# Collator that batches examples and applies masked-language-model masking,
# selecting tokens with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [29]:
# Training configuration: train-only run, 10 epochs, batch size 16 per device,
# loss logged every 10,000 steps, one checkpoint per epoch (at most 15 kept).
# NOTE(review): the output/log directories are named "electra" although the model
# built in this notebook is BertForMaskedLM — confirm the naming is intentional.
# NOTE(review): with the default learning-rate settings the logged loss only moves
# from ~9.23 to ~8.97 over the first 100k steps; consider tuning learning rate /
# warmup / effective batch size — confirm on a longer run.
training_args = TrainingArguments(
    output_dir="./output/output-electra/",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=False,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    log_level="info",
    logging_dir="./output/logs-electra/",
    logging_strategy="steps",
    logging_steps=10000,
    save_strategy="epoch",
    save_total_limit=15,
    # Explicitly keep today's default (report to every installed integration);
    # this silences the `--report_to` deprecation notice and pins the behaviour
    # ahead of the default changing in transformers v5.
    report_to="all",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
# Wire the model, training configuration, dataset and MLM collator into a Trainer.
# No evaluation dataset is supplied (do_eval is False in training_args).
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# Start the masked-LM pretraining run using the Trainer built in the previous cell.
trainer.train()

***** Running training *****
  Num examples = 8,056,227
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5,035,150
  Number of trainable parameters = 124,492,880


Step,Training Loss
10000,9.2279
20000,9.0844
30000,9.0781
40000,9.0834
50000,9.0098
60000,8.9692
70000,8.978
80000,8.9748
90000,8.9635
100000,8.9678
