In [1]:
import re
import os
import math
import warnings
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from huggingface_hub import notebook_login
from transformers import AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers.trainer_callback import EarlyStoppingCallback
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask

warnings.filterwarnings('ignore')

In [2]:
# Set the environment flags meant to switch off the MLflow and
# Weights & Biases logging integrations before any trainer is created.
os.environ.update({
    "DISABLE_MLFLOW_INTEGRATION": "TRUE",
    "WANDB_DISABLED": "TRUE",
})

In [4]:
# Location of the training CSV, given relative to the current working directory.
file_path = '../datasets/train.csv'
# Read the whole CSV into a pandas DataFrame.
data = pd.read_csv(file_path)

In [5]:
# Keep only the raw text column. The explicit copy detaches the result from the
# frame it was selected from, so later column assignments write to an
# independent DataFrame rather than to a selection pandas may treat as a view
# (the situation that triggers SettingWithCopyWarning).
data = data[['text']].copy()

### Clean Text

In [6]:
# Both patterns are compiled once here instead of on every call. Raw strings keep
# `\s` / `\S` from being parsed as (invalid) string escape sequences.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\sáéíóúàèìòùâêîôûãõçÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÃÕÇ]')
_URL = re.compile(r'http\S+')


def Clear(text):
    """Normalize one raw text sample.

    Steps, in order:
      1. remove the literal markers 'rt @user' and '@user';
      2. remove every token that starts with 'http' (URLs);
      3. replace each character that is not an ASCII letter, a digit,
         whitespace or one of the listed accented letters with a space;
      4. collapse all whitespace runs (including newlines) into single
         spaces and strip both ends.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        str: The cleaned text; ``''`` when nothing is left.
    """
    # Missing values would otherwise fail on `.replace` with an AttributeError.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ''
    text = str(text)

    # The longer marker goes first so 'rt ' does not survive on its own.
    text = text.replace('rt @user', '')
    text = text.replace('@user', '')
    # URLs are removed before punctuation is blanked out; afterwards the
    # pieces of a URL would no longer form a single 'http...' token.
    text = _URL.sub('', text)
    text = _DISALLOWED_CHARS.sub(' ', text)
    # split() with no arguments also splits on newlines, so no separate
    # newline replacement is needed.
    return ' '.join(text.split())

In [7]:
# Run the cleaning function over every entry of the text column.
data['text'] = data['text'].apply(Clear)

### Structure Data

In [8]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
train_dataset = Dataset.from_pandas(data)

In [9]:
# Wrap the dataset in a DatasetDict that holds a single 'train' split.
datasets = DatasetDict({'train': train_dataset})

### Tokenize

In [10]:
# Name of the pretrained checkpoint passed to `from_pretrained`.
model_name = 'neuralmind/bert-base-portuguese-cased'

In [11]:
# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [12]:
def TokenizerFunc(input_, max_length=512):
    """Tokenize the 'text' entry of an example or batch of examples.

    Args:
        input_: Mapping with a 'text' key holding the text(s) to tokenize.
        max_length: Upper bound on the number of tokens kept per text.
            Defaults to 512, the usual position-embedding limit of
            BERT-base checkpoints; longer inputs are truncated instead of
            failing later inside the model.

    Returns:
        The tokenizer output for ``input_['text']``.
    """
    # Inputs at or below `max_length` tokens are unaffected by truncation.
    return tokenizer(input_['text'], truncation=True, max_length=max_length)

In [13]:
# Tokenize all splits in batches across 4 worker processes and drop the raw
# 'text' column from the result.
dataset_tokens = datasets.map(
    TokenizerFunc,
    batched=True,
    num_proc=4,
    remove_columns=['text'],
)

Map (num_proc=4):   0%|          | 0/16800 [00:00<?, ? examples/s]

In [14]:
# Display the tokenized DatasetDict (splits, features and row counts).
dataset_tokens

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16800
    })
})

### Fine-tune BERTimbau

In [15]:
# Load the pretrained checkpoint as a masked-language-modeling model.
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# NOTE(review): both paths are absolute (rooted at '/'); confirm this is
# intended and that the location is writable.
output_dir = '/finetune_metadata/checkpoints/'
logging_dir = '/finetune_metadata/logs/'

training_args = TrainingArguments(
    # Output locations.
    output_dir=output_dir,
    logging_dir=logging_dir,
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=6e-5,
    weight_decay=0.01,
    warmup_steps=100,
    gradient_accumulation_steps=1,
    # Batch sizes per device; the evaluation batch is twice the training batch.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Fixed seed for reproducibility.
    seed=42,
    # Log every 100 steps.
    logging_strategy='steps',
    logging_steps=100,
    # Evaluation-related settings.
    # NOTE(review): no evaluation strategy is configured here, so these
    # presumably have no effect unless an eval dataset/strategy is added — confirm.
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    do_train=True,
    do_eval=True,
    do_predict=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [17]:
# Batch collator for language modeling, configured with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

In [18]:
# Assemble the trainer from the model, its arguments, the collator and the
# tokenized training split. No evaluation dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_tokens['train'],
)

In [19]:
# Run training; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,3.1341
200,2.7773
300,2.6526
400,2.6228
500,2.5555
600,2.4855
700,2.4571
800,2.4688
900,2.3973
1000,2.3512


TrainOutput(global_step=5260, training_loss=2.086100287491831, metrics={'train_runtime': 2039.937, 'train_samples_per_second': 164.711, 'train_steps_per_second': 2.579, 'total_flos': 1.4292360502159872e+16, 'train_loss': 2.086100287491831, 'epoch': 20.0})

### Save Model

In [20]:
# Directory (relative to the working directory) that receives the saved files.
model_dir = './finetuned/'
# Save the trained model through the trainer.
trainer.save_model(model_dir)
# Save the tokenizer files into the same directory; as the last expression,
# the returned tuple of written file paths is displayed as the cell output.
tokenizer.save_pretrained(model_dir)

('./finetuned/tokenizer_config.json',
 './finetuned/special_tokens_map.json',
 './finetuned/vocab.txt',
 './finetuned/added_tokens.json',
 './finetuned/tokenizer.json')