In [1]:
import numpy
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import RobertaForSequenceClassification
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

In [2]:
#carregando o dataset go_emotions do HuggingFace
dataset = load_dataset('go_emotions')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
dataset.keys()

dict_keys(['train', 'validation', 'test'])

In [4]:
dataset['train'][0], len(dataset['train'])

({'text': "My favourite food is anything I didn't have to cook myself.",
  'labels': [27],
  'id': 'eebbqej'},
 43410)

In [5]:
print(f'numero de emoçoes: {len(dataset["train"].features["labels"].feature.names)}\n')
print(f'emoções: {dataset["train"].features["labels"].feature.names}')

numero de emoçoes: 28

emoções: ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [6]:
#instaciando o tokenizer do modelo e criando uma função para tokenizar os dados em batches
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

def tokenizer_batch(batch):
  return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)

In [None]:
#tokenizando os textos dos três conjuntos
dataset['train'] = dataset['train'].map(tokenizer_batch, batched=True)
dataset['validation'] = dataset['validation'].map(tokenizer_batch, batched=True)
dataset['test'] = dataset['test'].map(tokenizer_batch, batched=True)

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

In [8]:
#verificando as colunas que o tokenizer criou
dataset['train'].column_names

['text', 'labels', 'id', 'input_ids', 'attention_mask']

In [None]:
#formatando coluna target para terem todas amostras terem o mesmo tamanho
def formatar_labels(batch):
    num_labels = 28
    novas_labels = []

    for labels in batch["labels"]:
        vetor = torch.zeros(num_labels)
        for label in labels:
            vetor[label] = 1
        novas_labels.append(vetor)

    batch["labels"] = novas_labels
    return batch


dataset = dataset.map(formatar_labels, batched=True)


Map:   0%|          | 0/43410 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

In [10]:
#passando os dados do dataset para tensores
colunas_modelo = ["input_ids", "attention_mask", "labels"]

dataset['train'].set_format("torch", columns=colunas_modelo)
dataset['validation'].set_format("torch", columns=colunas_modelo)
dataset['test'].set_format("torch", columns=colunas_modelo)

In [11]:
#defindo os batches
batch_treino = DataLoader(dataset["train"], batch_size=16, shuffle=True)
batch_validacao = DataLoader(dataset["validation"], batch_size=16)

In [12]:
#treinamento do modelo
modelo = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=28, problem_type="multi_label_classification")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#passando o modelo para rodar na GPU, pois vamos usar uma GPU T4 para o fine-tuning
gpu = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modelo.to(gpu)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [14]:
#instaciando o otimizador
otimizador = AdamW(modelo.parameters(), lr=2e-5)

In [None]:
#treinamento do modelo: 4 epocas com batch size de 16, ou seja, a cada 16 amostras o modelo atualiza os pesos.
epocas = 4

for epoca in range(epocas):
    modelo.train()
    total_loss = 0

    for batch in batch_treino:
        ids = batch["input_ids"].to(gpu)
        mask = batch["attention_mask"].to(gpu)
        labels = batch["labels"].to(gpu).float()

        output = modelo(input_ids=ids, attention_mask=mask, labels=labels)
        loss = output.loss

        total_loss += loss.item()

        loss.backward()
        otimizador.step()
        otimizador.zero_grad()

    print(f"Época {epoca+1}/{epocas} | Perda média: {total_loss / len(batch_treino):.4f}")

Época 1/4 | Perda média: 0.1193
Época 2/4 | Perda média: 0.0851
Época 3/4 | Perda média: 0.0758
Época 4/4 | Perda média: 0.0676


In [None]:
modelo.save_pretrained("/content/modelo/matheus-roberta-goemotions")