# Pre-training

In [21]:
import os

from accelerate import accelerator

import torch # for cuda usage

import transformers
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

from datasets import load_dataset

### Configuring model


We will pretrain a RoBERTa-style transformer model, configured with the same number of layers and attention heads as a DistilBERT model. The model will feature a vocabulary size of 52,000, 12 attention heads, and 6 layers.

In [22]:
config = RobertaConfig(vocab_size=30000,
                       max_position_embeddings=514,
                       num_attention_heads=12,
                       num_hidden_layers=6,
                       type_vocab_size=1,
                       )
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30000
}



### Tokenizer

Initialize tokenizer from the one I trained:

In [23]:
tokenizer = RobertaTokenizer.from_pretrained('../LawBERTarg', max_len=512)

In [24]:
print(tokenizer)

RobertaTokenizer(name_or_path='../LawBERTarg', vocab_size=20397, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)


In [25]:
print(tokenizer.vocab_size)

20397


Test it:

In [26]:
print(tokenizer.tokenize("es un órgano independiente instituido en el ámbito del Congreso de la"))
print(tokenizer.encode("es un órgano independiente instituido en el ámbito del Congreso de la"))

['es', 'Ġun', 'ĠÃ³rgano', 'Ġindependiente', 'Ġins', 'tituido', 'Ġen', 'Ġel', 'ĠÃ¡mbito', 'Ġdel', 'ĠCongreso', 'Ġde', 'Ġla']
[2, 267, 362, 4228, 5660, 818, 998, 294, 286, 3596, 320, 2251, 262, 274, 3]


Initialize model:

In [27]:
model = RobertaForMaskedLM(config=config)

In [28]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm)

In [29]:
print(model.num_parameters())

66586416


### Loading data for processing

Loading dataset:

In [30]:
file_names = []

if os.path.exists('../data/') and os.path.isdir('../data/'):
    file_names = os.listdir('../data/')

file_names = ['../data/'+file_name for file_name in file_names]

file_names

['../data/codigo_penal.txt',
 '../data/codigo_de_etica_publica.txt',
 '../data/codigo_procesal_penal_federal.txt',
 '../data/codigo_aeronautico.txt',
 '../data/codigo_procesal_civil_y_comercial.txt',
 '../data/codigo_aduanero.txt',
 '../data/codigo_electoral_nacional.txt',
 '../data/codigo_civil_y_comercial.txt',
 '../data/codigo_alimentario_argentino.txt',
 '../data/constitucion-nacional.txt',
 '../data/codigo_de_mineria.txt']

In [31]:
combined_file = "combined.txt"

with open(combined_file, "w") as outfile:
    for fname in file_names:
        with open(fname) as infile:
            outfile.write(infile.read() + "\n") # newline between files

Load the dataset to generate samples for batch training, with block_size=256 limiting the length of an example:

In [32]:
"""dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=combined_file,
    block_size=64
)"""

dataset = load_dataset(path="text",
                       data_files=file_names,
                       )

In [33]:
def tokenize_function(examples):
    return tokenizer(examples["text"], # select the text lines from dataset
                     truncation=True,
                     padding="max_length",
                     max_length=64)

# map tokenize function to loaded dataset
tokenized_dataset = dataset.map(tokenize_function,
                                batched=True,
                                remove_columns=["text"])
# now this tokenized dataset cas columns: input_ids, attention_masks, among other fields (text was removed)

Create a data collator, that will collate samples from the dataset to prepare batch processing.

Prepare for Masked Language Modeling by setting mlm=True and specifying a masking probability of 0.15, meaning 15% of tokens will be masked during pretraining.

In [34]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

### Training

Initialize trainer:

In [35]:
# instance of TrainingArguments that stores hyperparams
training_args = TrainingArguments(
    output_dir='../LawBERTarg/',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16, # batch_size, low due to gpu memory restriction
    save_steps=5000, # checkpoint saved every 10k steps
    save_total_limit=4,
    prediction_loss_only=True,
)

In [36]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [37]:
trainer = Trainer(
    model=model, # the RoBERTa we initialized
    args=training_args,
    data_collator=data_collator,
    #train_dataset=dataset,
    train_dataset=tokenized_dataset["train"],
)

Start pre-training:

In [38]:
%%time
trainer.train()

Step,Training Loss
500,7.2941
1000,6.5111
1500,6.2075
2000,6.0784
2500,5.919
3000,5.8532
3500,5.7451
4000,5.7726
4500,5.6389
5000,5.5895


CPU times: user 42min 39s, sys: 4.75 s, total: 42min 44s
Wall time: 42min 45s


TrainOutput(global_step=6162, training_loss=5.967783966609233, metrics={'train_runtime': 2564.8615, 'train_samples_per_second': 38.423, 'train_steps_per_second': 2.402, 'total_flos': 1632967987507200.0, 'train_loss': 5.967783966609233, 'epoch': 3.0})

Save model:

In [39]:
trainer.save_model("../LawBERTarg/")