In [1]:
import re
import os
import math
import tqdm
import torch
import random
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from collections import Counter
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from transformers.trainer_callback import EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

warnings.filterwarnings('ignore')

In [2]:
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"
os.environ["WANDB_DISABLED"] = "TRUE"

### Read NER Dataset

In [3]:
file_path = '../datasets/train.csv'
data = pd.read_csv(file_path)

### Clear Text

In [4]:
def Clear(text):
    text = text.replace('rt @user', '')
    text = text.replace('@user', '')
    pattern = re.compile('[^a-zA-Z0-9\sáéíóúàèìòùâêîôûãõçÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÃÕÇ]')
    text = re.sub(r'http\S+', '', text)
    text = pattern.sub(r' ', text)
    text = text.replace('\n', ' ')
    text = ' '.join(text.split())
    return text

In [5]:
data['text'] = data.text.apply(lambda x: Clear(x))

In [6]:
x = Counter(data.label).most_common()
m = x[0][1]/(x[0][1]+x[1][1])
print('Non-Toxic - {}% <-> Toxic - {}%'.format(round(m, 2), round(1-m, 2)))

Non-Toxic - 0.56% <-> Toxic - 0.44%


### Train and Validation

In [7]:
train, validation = train_test_split(data, test_size=0.1, random_state=0)
train = train.reset_index(drop=True)
validation = validation.reset_index(drop=True)

In [8]:
x = Counter(train.label).most_common()
m = x[0][1]/(x[0][1]+x[1][1])
print('Non-Toxic - {}% <-> Toxic - {}%'.format(round(m, 2), round(1-m, 2)))

Non-Toxic - 0.56% <-> Toxic - 0.44%


In [9]:
x = Counter(validation.label).most_common()
m = x[0][1]/(x[0][1]+x[1][1])
print('Non-Toxic - {}% <-> Toxic - {}%'.format(round(m, 2), round(1-m, 2)))

Non-Toxic - 0.57% <-> Toxic - 0.43%


### Create Datasets

In [10]:
train_dataset = Dataset.from_pandas(train)
validation_dataset = Dataset.from_pandas(validation)

In [11]:
datasets = DatasetDict()
datasets['train'] = train_dataset
datasets['validation'] = validation_dataset

### Tokenize Inputs

In [12]:
model_name = '../model-foundation/finetuned/'

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [15]:
def TokenizerFunc(input_):
    result = tokenizer(input_['text'], padding=True)
    return result

In [16]:
dataset_tokens = datasets.map(TokenizerFunc, batched=True, num_proc=4, remove_columns=['text'])

Map (num_proc=4):   0%|          | 0/15120 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1680 [00:00<?, ? examples/s]

### FineTune BERTimbau Model

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at ../model-foundation/finetuned/ were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../model-found

In [18]:
output_dir = '/finetune_metadata/checkpoints/'
logging_dir = '/finetune_metadata/logs/'
    
training_args = TrainingArguments(output_dir=output_dir,
                                  logging_dir=logging_dir,
                                  max_steps=1000,
                                  learning_rate=1e-5,
                                  weight_decay=0.01,
                                  adam_beta1 = 0.7,
                                  adam_beta2 = 0.999,
                                  adam_epsilon = 2e-08,
                                  max_grad_norm = 1.5,
                                  per_device_train_batch_size=2,
                                  per_device_eval_batch_size=2,
                                  logging_steps=50,
                                  warmup_steps=100,
                                  gradient_accumulation_steps=1,
                                  seed=42,
                                  metric_for_best_model='eval_loss',
                                  logging_strategy='steps',
                                  evaluation_strategy='steps',
                                  greater_is_better=False,
                                  do_train=True,
                                  do_eval=True,
                                  do_predict=True)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [19]:
data_collator = DataCollatorWithPadding(tokenizer)

In [20]:
def ComputeMetrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall}

In [21]:
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=dataset_tokens['train'],
                  eval_dataset=dataset_tokens['validation'],
                  compute_metrics=ComputeMetrics,
                  data_collator=data_collator)

In [22]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6967,0.685183,0.553571,0.200426,0.425339,0.131102
100,0.6866,0.672272,0.6375,0.504475,0.605469,0.432357
150,0.643,0.641894,0.619643,0.221681,0.875,0.126918
200,0.6015,0.554062,0.746429,0.677761,0.740496,0.624826
250,0.5451,0.517344,0.75,0.66877,0.76951,0.591353
300,0.5535,0.492633,0.765476,0.70858,0.754331,0.668061
350,0.5913,0.505627,0.769643,0.737271,0.718254,0.757322
400,0.4758,0.536851,0.76131,0.754139,0.672867,0.857741
450,0.5505,0.493465,0.770833,0.752093,0.698565,0.814505
500,0.5094,0.589685,0.745238,0.758192,0.637227,0.935844


TrainOutput(global_step=1000, training_loss=0.5496741771697998, metrics={'train_runtime': 724.1919, 'train_samples_per_second': 5.523, 'train_steps_per_second': 1.381, 'total_flos': 324753042298560.0, 'train_loss': 0.5496741771697998, 'epoch': 0.26})

In [23]:
model_dir = './finetuned/'
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('./finetuned/tokenizer_config.json',
 './finetuned/special_tokens_map.json',
 './finetuned/vocab.txt',
 './finetuned/added_tokens.json',
 './finetuned/tokenizer.json')