In [1]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import pickle

import pandas as pd

import numpy as np

import evaluate

import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset

from torch.nn import CrossEntropyLoss
import torch

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Abrindo o objeto dataframe
with open(r'df-pre-processado-cloud2.pickle', 'rb') as pickledfile:
    df = pickle.load(pickledfile)

In [3]:
# Abrindo o objeto dataframe
with open(r'df-tokenizado.pickle', 'rb') as pickledfile:
    df2 = pickle.load(pickledfile)

In [4]:
# Testando o mapping
list(set([(x,y) for x,y in zip(df['label'], df2['Assunto'])]))

[(4, 'Direito do Trabalho'),
 (1, 'Direito Penal'),
 (2, 'Direito Previdenciário'),
 (3, 'Direito Tributário'),
 (0, 'Direito Civil')]

In [5]:
# Abrindo o objeto dataframe
with open(r'lista-splits.pickle', 'rb') as pickledfile:
    lista_splits = pickle.load(pickledfile)

In [6]:
metric = evaluate.load("accuracy")

In [7]:
def compute_metrics(eval_pred):

    logits, labels = eval_pred

    predictions = np.argmax(logits, axis=-1)

    return metric.compute(predictions=predictions, references=labels)

In [8]:
def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([2,1,1,1,1])
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = weights * (loss_per_sample).mean()
    return weighted_loss

In [9]:
class MyTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs.get("input_ids")
        outputs = model(input_ids)
        loss = keytoken_weighted_loss(input_ids, outputs.logits, keytoken_ids)

        return (loss, outputs) if return_outputs else loss

In [10]:
bertmodel = AutoModelForSequenceClassification.from_pretrained('neuralmind/bert-large-portuguese-cased', num_labels = 5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
training_args = TrainingArguments(output_dir="treino/split1", eval_strategy="epoch", use_cpu=False, no_cuda=False)

t_ = lista_splits[0][1]
v_ = lista_splits[0][2]

df_teste = pd.DataFrame.from_dict(
    {'input_ids': df.loc[t_, 'input_ids'], 'label': df.loc[t_, 'label'], 'attention_mask':df.loc[t_, 'attention_mask'] }
)
hg_dataset_teste = Dataset(pa.Table.from_pandas(df_teste))

df_val = pd.DataFrame.from_dict(
    {'input_ids': df.loc[v_, 'input_ids'], 'label': df.loc[v_, 'label'], 'attention_mask':df.loc[v_, 'attention_mask'] }
)
hg_dataset_val = Dataset(pa.Table.from_pandas(df_val))

trainer = MyTrainer(
    model=bertmodel,
    args=training_args,
    train_dataset=hg_dataset_teste,
    eval_dataset=hg_dataset_val,
    compute_metrics=compute_metrics
)

In [12]:
trainer.train()

  0%|          | 0/25527 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
