In [26]:
import os
import time

import evaluate
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from datasets import load_dataset
from IPython.display import clear_output
from torchview import draw_graph
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline, TrainingArguments, Trainer, BertForSequenceClassification

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded here; whether the training runs below are
# covered by this seed is not visible in this notebook — confirm.
np.random.seed(10)

In [25]:
# Input and output locations, expressed as relative paths.
datasets_path = 'etiquetados'
results_path = 'results'

# The three CSV files all live directly under `datasets_path`.
hf_train_path, hf_test_path, prediction_path = (
    os.path.join(datasets_path, csv_name)
    for csv_name in (
        'hf_unified_dataset_training.csv',
        'hf_unified_dataset_testing.csv',
        'prediction_dataset.csv',
    )
)

In [3]:
def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy, precision, recall and F1.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``. The predicted
            class of each row is the index of its largest logit along the
            last axis.

    Returns:
        dict: Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``,
        each holding the value reported by the corresponding module-level
        metric object (``acc_metric``, ``precision_metric``,
        ``recall_metric``, ``f1_metric``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # (result key, metric object) pairs, evaluated in this order.
    scorers = (
        ('accuracy', acc_metric),
        ('precision', precision_metric),
        ('recall', recall_metric),
        ('f1', f1_metric),
    )
    return {
        key: scorer.compute(predictions=predicted_classes, references=labels)[key]
        for key, scorer in scorers
    }

In [4]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with the module-level tokenizer.

    Every sequence is padded to ``max_length`` and truncated, with the length
    fixed at 30.

    Args:
        examples: Mapping that exposes the raw text under the ``"text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 30}
    return tokenizer(examples["text"], **encoding_options)

In [5]:
# Human-readable label for each predicted class index (position in the tuple == class id).
pred_mapper = dict(enumerate((
    "Evento não relevante de trânsito",
    "Evento relevante de trânsito",
)))

## Experimentos com BERTimbau

In [6]:
# Header line for the per-round results CSV: comma-separated column names
# followed by a single newline.
HEADER_bertimbau = ','.join([
    'round',
    'train_time (s)',
    'accuracy',
    'precision',
    'recall',
    'f1',
]) + '\n'

In [7]:
# with open(os.path.join(results_path, 'bertimbau_10_rounds.csv'), 'w') as file:
#     file.write(HEADER_bertimbau)
#     file.write('\n')

### Download do modelo, tokenizador e métricas

In [8]:
# Load the BERTimbau checkpoint with a 2-label sequence-classification head,
# plus the tokenizer published with the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Read the training and testing CSV files into "train" and "test" splits.
dataset = load_dataset("csv", data_files={"train": hf_train_path, "test": hf_test_path})

In [10]:
# Apply tokenize_function to every split of the dataset, in batched mode.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [11]:
# Load the four metric objects that compute_metrics reads as module-level names.
acc_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

### Desempenho de classificação pré-treino

In [12]:
# Load the base checkpoint with no fine-tuning, to serve as the pre-training baseline.
# The warning printed by this cell reports that the classifier weights are newly initialised.
bertimbau = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Tokenise two sample sentences as PyTorch tensors (padding and truncation on,
# at most 30 tokens) and run them through the baseline model.
tokens_bertimbau = tokenizer(["Teste", "Hoje a bolsa subiu"], return_tensors="pt",
                    padding=True, truncation=True, max_length=30)
bertimbau_outputs = bertimbau(**tokens_bertimbau)

In [14]:
# Map each row's highest-scoring logit to its human-readable label and show the result.
preds_bertimbau = [
    pred_mapper[class_id]
    for class_id in np.argmax(bertimbau_outputs.logits.cpu().detach().numpy(), axis=-1)
]
display(preds_bertimbau)

['Evento não relevante de trânsito', 'Evento não relevante de trânsito']

## Primeiro experimento

In [15]:
# First experiment: evaluate after every epoch, never write checkpoints, run on CPU.
# Every other setting is left at the library default.
training_args = TrainingArguments(
    output_dir=results_path,
    evaluation_strategy="epoch",
    save_strategy="no",
    use_cpu=True,
)

In [16]:
# Trainer for the first experiment: fine-tunes `model` on the tokenised train split
# and scores it on the tokenised test split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the first fine-tuning experiment (training_args requests evaluation every epoch).
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Save the first experiment's model under <results_path>/models/first_run_model.
trainer.save_model(os.path.join(results_path, 'models', 'first_run_model'))

### Avaliando modelo obtido com o 1º treinamento

In [None]:
# Reload the model saved by the first experiment from <results_path>/models/first_run_model.
tweetbertptbr = BertForSequenceClassification.from_pretrained(os.path.join(results_path, 'models', 'first_run_model'))

In [None]:
# Tokenise two sample sentences as PyTorch tensors (padding and truncation on,
# at most 30 tokens) and run them through the reloaded first-run model.
tokens = tokenizer(["Estou preso no trânsito", "Hoje a bolsa subiu"], return_tensors="pt",
                    padding=True, truncation=True, max_length=30)
tweetbertptbr_outputs = tweetbertptbr(**tokens)

In [None]:
# Map each row's highest-scoring logit to its human-readable label and show the result.
preds = [
    pred_mapper[class_id]
    for class_id in np.argmax(tweetbertptbr_outputs.logits.cpu().detach().numpy(), axis=-1)
]
display(preds)

## Segundo experimento

In [19]:
# Second experiment: explicit learning rate, batch size 16 for both training and
# evaluation, 3 epochs, evaluation after every epoch, no checkpoints, CPU only.
training_args_2 = TrainingArguments(
    output_dir=results_path,
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    use_cpu=True,
)

In [20]:
# Trainer for the second experiment: same data and metrics as the first, with training_args_2.
# NOTE(review): this passes the same `model` object that the first experiment's
# trainer received, so if that experiment ran first in this kernel the second one
# presumably starts from already fine-tuned weights — confirm this is intended.
trainer_2 = Trainer(
    model=model,
    args=training_args_2,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [21]:
# Run the second fine-tuning experiment; the returned TrainOutput is shown as the cell result.
# total_flos: The total number of floating operations done by the model since the beginning of training.
trainer_2.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0036,0.057888,0.992341,0.992912,0.990659,0.991784
2,0.0043,0.070903,0.992341,0.993572,0.989992,0.991779
3,0.0094,0.067505,0.992154,0.991331,0.99186,0.991596


TrainOutput(global_step=9036, training_loss=0.005026228805684829, metrics={'train_runtime': 15055.1741, 'train_samples_per_second': 9.6, 'train_steps_per_second': 0.6, 'total_flos': 2228185777865400.0, 'train_loss': 0.005026228805684829, 'epoch': 3.0})

In [22]:
# Save the second experiment's model under <results_path>/models/second_run_model;
# the prediction-time section reloads it from this path.
trainer_2.save_model(os.path.join(results_path, 'models', 'second_run_model'))

## Experimento com 10 rodadas

In [15]:
# 10-round experiment: explicit learning rate, batch size 16 for both training and
# evaluation, 3 epochs, evaluation after every epoch, no checkpoints, CPU only.
training_args_10 = TrainingArguments(
    output_dir=results_path,
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    use_cpu=True,
)

In [16]:
# Trainer shared by every round of the 10-round experiment (used by bertimbau_10_rounds).
# NOTE(review): a single `model` object is passed and no per-round re-initialisation
# is configured here, so successive rounds presumably keep training the same
# weights rather than starting fresh — confirm this is intended.
trainer_10 = Trainer(
    model=model,
    args=training_args_10,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [17]:
def bertimbau_10_rounds(rounds):
    """Train and evaluate ``trainer_10`` ``rounds`` times, logging one CSV row per round.

    Each round calls ``trainer_10.train()`` followed by ``trainer_10.evaluate()``
    and appends ``round,train_runtime,accuracy,precision,recall,f1`` to
    ``<results_path>/bertimbau_10_rounds.csv``.

    The row's round label is the module-level ``executed_rounds`` counter, not
    the local loop index. When the caller resumes after a failure with a smaller
    ``rounds`` value, the labels therefore continue where the previous call
    stopped instead of restarting at 0 and duplicating earlier labels.

    Args:
        rounds: Number of train/evaluate rounds to run in this call.

    Side effects:
        Appends to the CSV file, increments the global ``executed_rounds`` once
        per completed round (it must be defined before the call), and clears the
        cell output after each round.

    Raises:
        Whatever ``trainer_10.train()``, ``trainer_10.evaluate()`` or the file
        write raise; ``executed_rounds`` then still counts only completed rounds.
    """
    global executed_rounds
    csv_path = os.path.join(results_path, 'bertimbau_10_rounds.csv')
    for _ in range(rounds):
        output = trainer_10.train()
        resultados = trainer_10.evaluate()
        bertimbau_line_results = [
            str(executed_rounds),
            str(output.metrics['train_runtime']),
            str(resultados['eval_accuracy']),
            str(resultados['eval_precision']),
            str(resultados['eval_recall']),
            str(resultados['eval_f1']),
        ]
        # Append mode: rows from earlier rounds (and earlier calls) are preserved.
        with open(csv_path, 'a') as file:
            file.write(','.join(bertimbau_line_results))
            file.write('\n')
        # Count the round only once its row is safely on disk.
        executed_rounds += 1
        clear_output(wait=True)

In [None]:
# Run the 10-round experiment. If a round fails, retry once for the rounds that
# are still missing; `executed_rounds` is advanced by bertimbau_10_rounds after
# every completed round.
executed_rounds = 0
try:
    bertimbau_10_rounds(10)
except Exception as first_failure:
    # Report the failure instead of swallowing it silently, then resume.
    print(f"Round {executed_rounds} failed with {first_failure!r}; "
          f"retrying the remaining {10 - executed_rounds} round(s).")
    bertimbau_10_rounds(10 - executed_rounds)

## Cálculo da média do tempo de predição

In [27]:
# Load the dataset used for the prediction-time measurement; its 'text' column is read below.
pred_dataset = pd.read_csv(prediction_path)

In [46]:
# Reload the model saved by the second experiment, and re-create the tokenizer from
# the base checkpoint so this section does not depend on the training cells having run.
secondmodel_bertimbau = BertForSequenceClassification.from_pretrained(os.path.join(results_path, 'models', 'second_run_model'))
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

In [None]:
# tempo médio de inferência usando a GPU fica muito menor. De 0.034 vai para 0.0065.
# secondmodel_bertimbau = secondmodel_bertimbau.to("cuda:0")
# tokens_prediction = tokenizer(pred_dataset['text'].to_list()[iter], return_tensors="pt",
#                 padding=True, truncation=True, max_length=30).to("cuda:0")

In [47]:
# Average single-text inference time over (at most) the first 100 texts.
# Materialise the column once instead of on every iteration.
prediction_texts = pred_dataset['text'].to_list()
# Never index past the end of a short dataset, and divide by the real sample count.
n_timed_predictions = min(100, len(prediction_texts))
if n_timed_predictions == 0:
    raise ValueError("prediction dataset has no rows to time")

total_prediction_time = 0.0
# Inference only: disable autograd so graph bookkeeping does not inflate the timings.
with torch.no_grad():
    for text_index in range(n_timed_predictions):
        tokens_prediction = tokenizer(prediction_texts[text_index], return_tensors="pt",
                                      padding=True, truncation=True, max_length=30)
        # Only the model forward pass is timed; tokenisation is excluded.
        start_time = time.perf_counter()
        tweetbertptbr_pred_outputs = secondmodel_bertimbau(**tokens_prediction)
        prediction_time = time.perf_counter() - start_time
        total_prediction_time += prediction_time

media_tempo_predicao = total_prediction_time / n_timed_predictions
print(media_tempo_predicao)

0.033935779998719225


### Representação Gráfica do Modelo

In [None]:
# torchview needs a concrete example input to trace the model. `inputs` was not
# defined anywhere in this notebook, so build it here from the tokenizer.
inputs = tokenizer("Estou preso no trânsito", return_tensors="pt",
                   padding=True, truncation=True, max_length=30)
model_graph = draw_graph(model, input_data=inputs, device='cpu')
# Last expression of the cell: renders the graph.
model_graph.visual_graph