# Pre-training

In [1]:
import os

from accelerate import accelerator

import torch # for cuda usage

import transformers
from transformers import RobertaConfig, RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

from datasets import load_dataset

### Configuring model


We will pretrain a RoBERTa-style transformer model, configured with the same number of layers and attention heads as a DistilBERT model. The model will feature a vocabulary size of 52,000, 12 attention heads, and 6 layers.

In [2]:
config = RobertaConfig(vocab_size=30000,
                       max_position_embeddings=514,
                       num_attention_heads=12,
                       num_hidden_layers=12,
                       type_vocab_size=1,
                       )
print(config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 30000
}



### Tokenizer

Initialize tokenizer from the one I trained:

In [3]:
tokenizer = RobertaTokenizer.from_pretrained('../LawBERTarg', max_len=512)

In [4]:
print(tokenizer)

RobertaTokenizer(name_or_path='../LawBERTarg', vocab_size=20341, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)


In [5]:
print(tokenizer.vocab_size)

20341


Test it:

In [6]:
print(tokenizer.tokenize("es un órgano independiente instituido en el ámbito del Congreso de la"))

['es', 'Ġun', 'ĠÃ³rgano', 'Ġindependiente', 'Ġinstituido', 'Ġen', 'Ġel', 'ĠÃ¡mbito', 'Ġde', 'l', 'ĠCongreso', 'Ġde', 'Ġla']


Initialize model:

In [7]:
model = RobertaForMaskedLM(config=config)

In [8]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNor

In [9]:
print(model.num_parameters())

109113648


### Loading data for processing

Loading dataset:

In [10]:
file_names = []

if os.path.exists('../data/') and os.path.isdir('../data/'):
    file_names = os.listdir('../data/')

file_names = ['../data/'+file_name for file_name in file_names]

file_names

['../data/codigo_penal.txt',
 '../data/codigo_de_etica_publica.txt',
 '../data/codigo_procesal_penal_federal.txt',
 '../data/codigo_aeronautico.txt',
 '../data/codigo_procesal_civil_y_comercial.txt',
 '../data/codigo_aduanero.txt',
 '../data/constitucion-caba.txt',
 '../data/codigo_electoral_nacional.txt',
 '../data/codigo_civil_y_comercial.txt',
 '../data/codigo_alimentario_argentino.txt',
 '../data/constitucion-nacional.txt',
 '../data/codigo_de_mineria.txt']

Load the dataset to generate samples for batch training, with block_size=256 limiting the length of an example:

In [11]:
"""dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=combined_file,
    block_size=64
)"""

dataset = load_dataset(path="text",
                       data_files=file_names,
                       )

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["text"], # select the text lines from dataset
                     truncation=True,
                     padding="max_length",
                     max_length=64)

# map tokenize function to loaded dataset
tokenized_dataset = dataset.map(tokenize_function,
                                batched=True,
                                remove_columns=["text"])
# now this tokenized dataset cas columns: input_ids, attention_masks, among other fields (text was removed)

Map:   0%|          | 0/34297 [00:00<?, ? examples/s]

Create a data collator, that will collate samples from the dataset to prepare batch processing.

Prepare for Masked Language Modeling by setting mlm=True and specifying a masking probability of 0.15, meaning 15% of tokens will be masked during pretraining.

In [13]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.20
)

### Training

Initialize trainer:

In [14]:
# instance of TrainingArguments that stores hyperparams
training_args = TrainingArguments(
    output_dir='../LawBERTarg/',
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16, # batch_size, low due to gpu memory restriction
    save_steps=10000, # checkpoint saved every 10k steps
    save_total_limit=3,
    prediction_loss_only=True,
)

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"Using device: {device}")

Using device: cuda


In [16]:
trainer = Trainer(
    model=model, # the RoBERTa we initialized
    args=training_args,
    data_collator=data_collator,
    #train_dataset=dataset,
    train_dataset=tokenized_dataset["train"],
)

Start pre-training:

In [17]:
%%time
trainer.train()

Step,Training Loss
500,7.3532
1000,6.7111
1500,6.4833
2000,6.288
2500,6.0778
3000,5.9755
3500,5.8185
4000,5.8234


CPU times: user 47min 38s, sys: 4.07 s, total: 47min 42s
Wall time: 47min 42s


TrainOutput(global_step=4288, training_loss=6.2748199861441085, metrics={'train_runtime': 2862.6868, 'train_samples_per_second': 23.961, 'train_steps_per_second': 1.498, 'total_flos': 2256770116620288.0, 'train_loss': 6.2748199861441085, 'epoch': 2.0})

Save model:

In [18]:
trainer.save_model("../LawBERTarg/")