# Pre-training

In [2]:
import os

import torch # for cuda usage

import transformers
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

from datasets import load_dataset

### Configuring model


We will pretrain a RoBERTa-style transformer model, configured with the same number of layers and attention heads as a DistilBERT model. The model will feature a vocabulary size of 52,000, 12 attention heads, and 6 layers.

In [4]:
config = RobertaConfig(vocab_size=20000,
                       max_position_embeddings=514,
                       num_attention_heads=12,
                       num_hidden_layers=6,
                       type_vocab_size=1,
                       )
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 20000
}



### Tokenizer

Initialize tokenizer from the one I trained:

In [5]:
tokenizer = RobertaTokenizer.from_pretrained('../LawBERTarg', max_len=512)

In [6]:
print(tokenizer)

RobertaTokenizer(name_or_path='../LawBERTarg', vocab_size=13060, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)


In [7]:
print(tokenizer.vocab_size)

13060


Test it:

In [9]:
print(tokenizer.tokenize("n) Emitir recomendaciones pro-competitivas de carácter general o sectorial respecto a las modalidades de la competencia en los mercados;"))

['n', ')', 'ĠEmitir', 'Ġrecomend', 'aciones', 'Ġpro', '-', 'competitivas', 'Ġde', 'ĠcarÃ¡cter', 'Ġgeneral', 'Ġo', 'Ġsectorial', 'Ġrespecto', 'Ġa', 'Ġlas', 'Ġmodalidades', 'Ġde', 'Ġla', 'Ġcompetencia', 'Ġen', 'Ġlos', 'Ġmercados', ';']


Initialize model:

In [10]:
model = RobertaForMaskedLM(config=config)

In [11]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(20000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

In [12]:
print(model.num_parameters())

58896416


### Loading data for processing

Loading dataset:

In [13]:
file_names = ['../data/ley_bases.txt', '../data/constitucion-nacional.txt']

"""if os.path.exists('../data/') and os.path.isdir('../data/'):
    file_names = os.listdir('../data/')

file_names = ['../data/'+file_name for file_name in file_names]

file_names"""

"if os.path.exists('../data/') and os.path.isdir('../data/'):\n    file_names = os.listdir('../data/')\n\nfile_names = ['../data/'+file_name for file_name in file_names]\n\nfile_names"

Load the dataset to generate samples for batch training, with block_size=256 limiting the length of an example:

In [14]:
"""dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=combined_file,
    block_size=64
)"""

dataset = load_dataset(path="text",
                       data_files=file_names,
                       )

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"], # select the text lines from dataset
                     truncation=True,
                     padding="max_length",
                     max_length=128)

# map tokenize function to loaded dataset
tokenized_dataset = dataset.map(tokenize_function,
                                batched=True,
                                remove_columns=["text"])
# now this tokenized dataset cas columns: input_ids, attention_masks, among other fields (text was removed)

Map:   0%|          | 0/10784 [00:00<?, ? examples/s]

Create a data collator, that will collate samples from the dataset to prepare batch processing.

Prepare for Masked Language Modeling by setting mlm=True and specifying a masking probability of 0.15, meaning 15% of tokens will be masked during pretraining.

In [17]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

### Training

Initialize trainer:

In [18]:
# instance of TrainingArguments that stores hyperparams
training_args = TrainingArguments(
    output_dir='../LawBERTarg/',
    overwrite_output_dir=True,
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=16, # batch_size, low due to gpu memory restriction
    save_steps=2500,
    prediction_loss_only=True,
    label_smoothing_factor=0.1,
    warmup_steps=500,
    logging_steps=100,
)

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [20]:
trainer = Trainer(
    model=model, # the RoBERTa we initialized
    args=training_args,
    data_collator=data_collator,
    #train_dataset=dataset,
    train_dataset=tokenized_dataset["train"],
)

Start pre-training:

In [21]:
%%time
trainer.train()

Step,Training Loss
100,9.5986
200,8.5334
300,7.8202
400,7.5305
500,7.2717
600,7.3832


CPU times: user 7min 54s, sys: 1.49 s, total: 7min 56s
Wall time: 7min 57s


TrainOutput(global_step=674, training_loss=7.937554243410377, metrics={'train_runtime': 477.2365, 'train_samples_per_second': 22.597, 'train_steps_per_second': 1.412, 'total_flos': 357297732452352.0, 'train_loss': 7.937554243410377, 'epoch': 1.0})

Save model:

In [22]:
trainer.save_model("../LawBERTarg/")