# 1- Desenho do processo: classificação multi rótulos usando BERT português

## Desafios

- Formatar os dados como `datasets` do HuggingFace
- Caso necessário após o treino multilabel, entender como pegar as 2 
classes mais prováveis, ou seja, as probabilidades de cada uma das classes.
    - resolvido: facil pegar da sigmoide

In [1]:
import torch
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
)

from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [2]:
csv_path = "../data/phrases/dataset.csv"

load_dataset(
    "csv",
    data_files=csv_path,
    split="train",
    delimiter=",",
    column_names=["text", "label"],
)

Dataset({
    features: ['text', 'label'],
    num_rows: 522
})

Observando acima, não posso carregar diretamente, preciso criar
as features one hot encoded.

In [3]:
## Trabalhando no pandinhas

df_raw = pd.read_csv(csv_path)

df_raw['ID'] = df_raw.index
df_raw = df_raw.rename(columns={'text': 'sentence', 'category': 'labels'})
df_raw['label_1'] = df_raw['labels'].str.split(',').str[0]
df_raw['label_2'] = df_raw['labels'].str.split(',').str[1]
# one hot encode
one_hot_label1 = pd.get_dummies(df_raw['label_1'])
one_hot_label2 = pd.get_dummies(df_raw['label_2'])

# mix both
one_hot_labels = one_hot_label1.add(one_hot_label2, fill_value=0).astype(int)
one_hot_labels

df_raw = pd.concat([df_raw, one_hot_labels], axis=1)
df_raw


Unnamed: 0,sentence,labels,ID,label_1,label_2,educação,finanças,indústrias,orgão público,varejo
0,"Auxílio-Doença Previdenciário, Benefícios em E...",orgão público,0,orgão público,,0,0,0,1,0
1,"PAGAR TODAS AS CONTAS EM ATRASO R$1.290,90.",finanças,1,finanças,,0,1,0,0,0
2,Então encontraremos na próxima aula.,educação,2,educação,,1,0,0,0,0
3,Veja os resultados da categoria de ofertas do ...,indústrias,3,indústrias,,0,0,1,0,0
4,"Além disso, a embalagem é reutilizável e 100% ...","indústrias,varejo",4,indústrias,varejo,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
516,"Selecione o local de estudo, curso sem encontr...",educação,516,educação,,1,0,0,0,0
517,ESTUDANTES DA REDE MUNICIPAL VOLTAM ÀS AULAS E...,"educação,orgão público",517,educação,orgão público,1,0,0,1,0
518,Empresas e órgãos públicos,orgão público,518,orgão público,,0,0,0,1,0
519,DGE – Departamento de Gestão Estratégica Metas...,orgão público,519,orgão público,,0,0,0,1,0


In [4]:
# mantem só as colunas de interesse
df_raw = df_raw[[
    'ID',
    'sentence',
    'educação',
    'finanças',
    'indústrias',
    'orgão público',
    'varejo',
]]

In [5]:
df_raw

Unnamed: 0,ID,sentence,educação,finanças,indústrias,orgão público,varejo
0,0,"Auxílio-Doença Previdenciário, Benefícios em E...",0,0,0,1,0
1,1,"PAGAR TODAS AS CONTAS EM ATRASO R$1.290,90.",0,1,0,0,0
2,2,Então encontraremos na próxima aula.,1,0,0,0,0
3,3,Veja os resultados da categoria de ofertas do ...,0,0,1,0,0
4,4,"Além disso, a embalagem é reutilizável e 100% ...",0,0,1,0,1
...,...,...,...,...,...,...,...
516,516,"Selecione o local de estudo, curso sem encontr...",1,0,0,0,0
517,517,ESTUDANTES DA REDE MUNICIPAL VOLTAM ÀS AULAS E...,1,0,0,1,0
518,518,Empresas e órgãos públicos,0,0,0,1,0
519,519,DGE – Departamento de Gestão Estratégica Metas...,0,0,0,1,0


Split: train/test/validation

In [6]:
# 75% treino
# 10% validação
# 15% teste


df_train = df_raw.sample(frac=0.65, random_state=0)
df_test = df_raw.drop(df_train.index) # holdout
df_val = df_test.sample(frac=0.40, random_state=0)
df_test = df_test.drop(df_val.index)

print(f"Shapes: train {df_train.shape}, val {df_val.shape}, test {df_test.shape}")

Shapes: train (339, 7), val (73, 7), test (109, 7)


In [7]:
df_train.to_csv('../data/phrases/train.csv', index=False)

In [8]:
dataset = load_dataset("csv", data_files="../data/phrases/train.csv")
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'educação', 'finanças', 'indústrias', 'orgão público', 'varejo'],
        num_rows: 339
    })
})

In [9]:
datasets_to_load = {
    "train": df_train,
    "test": df_test,
    "validation": df_val,
}

dataset = DatasetDict()

for ds_name, ds in datasets_to_load.items():
    # macetinho: remove index_level_0
    ds = ds.reset_index(drop=True)
    dataset[ds_name] = Dataset.from_pandas(ds)

dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'sentence', 'educação', 'finanças', 'indústrias', 'orgão público', 'varejo'],
        num_rows: 339
    })
    test: Dataset({
        features: ['ID', 'sentence', 'educação', 'finanças', 'indústrias', 'orgão público', 'varejo'],
        num_rows: 109
    })
    validation: Dataset({
        features: ['ID', 'sentence', 'educação', 'finanças', 'indústrias', 'orgão público', 'varejo'],
        num_rows: 73
    })
})

Dados prontos!

## Cria auxiliares do modelo

In [10]:
dataset['train']

Dataset({
    features: ['ID', 'sentence', 'educação', 'finanças', 'indústrias', 'orgão público', 'varejo'],
    num_rows: 339
})

In [11]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'sentence']]
labels

id2label = {idx:label for idx, label in enumerate(labels)}
id2label

label2id = {label:idx for idx, label in enumerate(labels)}
label2id

{'educação': 0,
 'finanças': 1,
 'indústrias': 2,
 'orgão público': 3,
 'varejo': 4}

## Preprocessamento

In [12]:
# model: portuguese bert
model_name = "neuralmind/bert-base-portuguese-cased"

from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
def preprocess_data(examples, text_colname="sentence"):
  # take a batch of texts
  text = examples[text_colname]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=32)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [14]:
encoded_dataset = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset['train'].column_names
)

Map:   0%|          | 0/339 [00:00<?, ? examples/s]

Map:   0%|          | 0/109 [00:00<?, ? examples/s]

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

In [15]:
encoded_dataset.data['train']

InMemoryTable
input_ids: list<item: int32>
  child 0, item: int32
token_type_ids: list<item: int8>
  child 0, item: int8
attention_mask: list<item: int8>
  child 0, item: int8
labels: list<item: double>
  child 0, item: double
----
input_ids: [[[101,2511,6769,131,10814,...,0,0,0,0,0],[101,2296,1017,21245,125,...,0,0,0,0,0],...,[101,3928,4352,117,1833,...,22320,14298,22357,21748,102],[101,14128,3549,22323,118,...,0,0,0,0,0]]]
token_type_ids: [[[0,0,0,0,0,...,0,0,0,0,0],[0,0,0,0,0,...,0,0,0,0,0],...,[0,0,0,0,0,...,0,0,0,0,0],[0,0,0,0,0,...,0,0,0,0,0]]]
attention_mask: [[[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0],...,[1,1,1,1,1,...,1,1,1,1,1],[1,1,1,1,1,...,0,0,0,0,0]]]
labels: [[[0,0,0,0,1],[0,0,0,1,0],...,[0,0,0,1,0],[1,0,1,0,0]]]

In [16]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [17]:
tokenizer.decode(example['input_ids'])

'[CLS] Mais vendido : Ecolapis de Cor 12 Cores + 2 Bicolor. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [18]:
example['labels']
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['varejo']

In [19]:
encoded_dataset.set_format("torch")

## Definição do modelo

In [20]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Treino

In [21]:
from transformers import TrainingArguments, Trainer

batch_size = 30
metric_name = "f1"
epochs = 30

args = TrainingArguments(
    output_dir = f"portuguese_pass_01",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
    # TODO: add logging dir to check with tensorboard
    logging_dir='./logs_portuguese',
)

## Métricas de avaliação

In [22]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

### Check: um unico passo

In [23]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [24]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  2511,  6769,   131, 10814,   715, 17573,   125,  1553,  1242,
        19483, 22281,   116,   245, 10912, 18744, 22282,   119,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])

In [25]:
#forward pass
outputs = model(
    input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0),
    labels=encoded_dataset['train'][0]['labels'].unsqueeze(0)
)

outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6760, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.1999,  0.2592, -0.0170, -0.1233,  0.1250]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [26]:
import os

In [27]:
import tqdm

# clear tqdm
from tqdm import tqdm




In [None]:
from transformers import TrainingArguments, Trainer

batch_size = 10
metric_name = "f1"
epochs = 10
run_id = '06'

args = TrainingArguments(
    output_dir = f"portuguese_pass_{run_id}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=4e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
    # TODO: add logging dir to check with tensorboard
    logging_dir=f'./logs_portuguese_{run_id}',
)

# gives full access to logging_dir and output_dir using os
os.makedirs(args.output_dir, exist_ok=True)
os.makedirs(args.logging_dir, exist_ok=True)
os.chmod(args.output_dir, 0o777)
os.chmod(args.logging_dir, 0o777)
os.chmod("./", 0o777)
tqdm._instances.clear()

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

## Eval

In [29]:
trainer.evaluate()

  0%|          | 0/8 [00:00<?, ?it/s]

{'eval_loss': 0.14765533804893494, 'eval_f1': 0.9019607843137255, 'eval_roc_auc': 0.9401065379712257, 'eval_accuracy': 0.863013698630137, 'eval_runtime': 0.2758, 'eval_samples_per_second': 264.64, 'eval_steps_per_second': 29.002, 'epoch': 6.0}


{'eval_loss': 0.14765533804893494,
 'eval_f1': 0.9019607843137255,
 'eval_roc_auc': 0.9401065379712257,
 'eval_accuracy': 0.863013698630137,
 'eval_runtime': 0.2758,
 'eval_samples_per_second': 264.64,
 'eval_steps_per_second': 29.002,
 'epoch': 6.0}

In [30]:
def infer_text(text_input, model, threshold=0.5):
    encoding = tokenizer(text_input, return_tensors="pt")
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
    outputs = trainer.model(**encoding)
    logits = outputs.logits
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= threshold)] = 1
    predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
    print(predicted_labels)

In [31]:
infer_text("O Brasil é um dos maiores países do mundo, menos quando se trata da educação.", model=model, threshold=0.7)

['educação']


In [32]:
infer_text("Novo Nissan Versa, um sedan com o melhor da tecnologia japonesa.", model=model, threshold=0.5)

['indústrias']


In [33]:
infer_text(
    "O governo de São Paulo anuncia que a vacinação para pessoas com 67 anos começa na próxima semana.",
    model=model,
    threshold=0.7
)

['orgão público']


In [34]:
# save model
save_id = f"portuguese_pass_saved_model{run_id}"

In [35]:
model.save_pretrained(save_id)

In [36]:
# reload model
new_model = AutoModelForSequenceClassification.from_pretrained(
    save_id,
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

In [49]:
infer_text(
    "A brutalidade policial é consequência direta da falta de estudos",
    model=new_model,
    threshold=0.5
)

['educação', 'orgão público']


Estamos prontos para usar.