In [1]:
import glob
import os
import subprocess
import time

import torch
from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from transformers.trainer_utils import get_last_checkpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Drive component of the current working directory, as split off by
# os.path.splitdrive (the remainder of the path is not needed).
DRIVER_LETTER, _cwd_remainder = os.path.splitdrive(os.getcwd())

In [3]:
# Ask PyTorch to empty its CUDA cache before anything else runs.
torch.cuda.empty_cache()


# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using CUDA.
# Every per-device query stays inside this guard: calling
# torch.cuda.get_device_name(0) without a usable CUDA device raises, which
# would defeat the CPU fallback chosen above.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [4]:
# Hyper-parameters consumed later when the training arguments are built.
config_ = dict(
    lr=5e-5,             # learning rate
    batch_size=32,       # per-device batch size (train and eval)
    epoch=30,            # number of training epochs
    weight_decay=0.001,
)

# Directory

In [5]:
# Relative locations used by the training arguments further down:
# `logging_dir` receives the logs, `output_dir` the saved checkpoints.
logging_dir, output_dir = "./logs", "./pretrained_bert"

# Load GPU Enabled Device

In [6]:
# Resolve the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


# Load Tokenizer and Model

In [7]:
# Load the custom tokenizer from the local "./men-tokenizer" directory.
# Original author's note: a cased vocabulary is used because some entities
# contain upper-case letters.
tokenizer = BertTokenizerFast.from_pretrained("./men-tokenizer")

# Architecture of the model to pre-train (12 layers, 12 heads, hidden size 768).
# The embedding table has to cover every id the tokenizer can produce, added
# tokens included; otherwise the embedding lookup indexes out of range during
# training. 50000 stays the floor, so the model is unchanged whenever the
# tokenizer already fits.
config = BertConfig(
    vocab_size=max(50000, len(tokenizer)),
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=512
)

# Built from the config alone, i.e. freshly initialised weights rather than a
# pretrained checkpoint.
model = BertForMaskedLM(config)

model.to(device)
print('No of parameters: ', model.num_parameters())

No of parameters:  124492880


# Load Dataset (80%-20% Splitting)

In [8]:
# Both splits are read with the same tokenizer and the same block size;
# only the corpus file differs.
_dataset_kwargs = dict(tokenizer=tokenizer, block_size=128)

train_dataset = LineByLineTextDataset(file_path="../../corpus/train.txt", **_dataset_kwargs)
eval_dataset = LineByLineTextDataset(file_path="../../corpus/test.txt", **_dataset_kwargs)



In [9]:
# Collator for masked language modelling: batches are built with the shared
# tokenizer and tokens are masked with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

# Define Training Arguments

In [10]:
# Define training arguments.
# Hyper-parameters come from `config_`; checkpoints go to `output_dir` and
# logs to `logging_dir`. Logging, evaluation and checkpointing all happen once
# per epoch, and at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=config_["epoch"],
    per_device_train_batch_size=config_["batch_size"],
    per_device_eval_batch_size=config_["batch_size"],
    save_total_limit=2,
    prediction_loss_only=True,
    logging_dir=logging_dir,
    learning_rate=config_["lr"],
    weight_decay=config_['weight_decay'],
    logging_first_step=True,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE: Trainer does not act on this field by itself; it only takes effect
    # when a checkpoint is passed to `trainer.train(resume_from_checkpoint=...)`.
    resume_from_checkpoint=True,
    # Mixed precision requires a CUDA device. Tie it to GPU availability so the
    # CPU fallback used for `device` elsewhere in this notebook still works.
    fp16=torch.cuda.is_available(),
    # Gradients are accumulated over 4 batches before each optimizer step.
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    # Alternative optimizer previously tried here: optim="adamw_bnb_8bit"
    optim="adafactor",
    report_to="wandb",
    run_name="MENBERT-SC",
    disable_tqdm=False  # keep the progress bar; set to True to hide it
)

In [11]:
# Assemble the pre-training Trainer from the pieces built in earlier cells:
# the model, its training arguments, both dataset splits and the MLM collator.
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer = Trainer(**_trainer_kwargs)

In [12]:
%%time

# Start pre-training
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohanrj-nlp[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,8.249,6.835558
1,6.5946,6.479207
2,6.4017,6.323391
4,6.2717,6.139711
5,6.1316,6.087021
6,6.0553,6.003584
8,6.0365,5.903749
9,5.8983,5.854782
10,5.8631,5.818561
12,5.8363,5.724998




CPU times: total: 47min 58s
Wall time: 48min 29s


TrainOutput(global_step=2670, training_loss=5.808804988146721, metrics={'train_runtime': 2909.4636, 'train_samples_per_second': 118.434, 'train_steps_per_second': 0.918, 'total_flos': 2.248951429160141e+16, 'train_loss': 5.808804988146721, 'epoch': 29.75})

In [15]:
# Write the trainer's model to the ./final_bert directory.
trainer.save_model(output_dir="./final_bert")

In [16]:
# Write the tokenizer files to ./final_vocab; the returned tuple of written
# paths is the cell's displayed output.
tokenizer.save_pretrained(save_directory="./final_vocab")

('./final_vocab\\tokenizer_config.json',
 './final_vocab\\special_tokens_map.json',
 './final_vocab\\vocab.txt',
 './final_vocab\\added_tokens.json',
 './final_vocab\\tokenizer.json')