In [1]:
! pip install transformers datasets evaluate seqeval
! pip install -U accelerate
! pip install -U transformers



Importando bibliotecas

In [3]:
import numpy as np
import os 
import pandas as pd
import datasets
import torch
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
import evaluate
from transformers import AutoModelForTokenClassification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

Leitura do dataset

In [4]:
dataset = datasets.load_from_disk("../dataset")

Informação do dataset

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 7828
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1177
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1390
    })
})

In [7]:
print(dataset['train'].description)


LeNER-Br is a Portuguese language dataset for named entity recognition
applied to legal documents. LeNER-Br consists entirely of manually annotated
legislation and legal cases texts and contains tags for persons, locations,
time entities, organizations, legislation and legal cases.
To compose the dataset, 66 legal documents from several Brazilian Courts were
collected. Courts of superior and state levels were considered, such as Supremo
Tribunal Federal, Superior Tribunal de Justiça, Tribunal de Justiça de Minas
Gerais and Tribunal de Contas da União. In addition, four legislation documents
were collected, such as "Lei Maria da Penha", giving a total of 70 documents



Convertendo para pandas dataframe

In [8]:
df_train = pd.DataFrame(dataset['train'])
df_validation = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

In [9]:
df_train[df_train['tokens'].apply(lambda x: len(x) == 0)]

Unnamed: 0,id,tokens,ner_tags
7827,7827,[],[]


Removendo linhas que não deveriam estar no dataset

In [10]:
df_train = df_train[df_train['tokens'].apply(lambda x: len(x) > 0)]
df_validation = df_validation[df_validation['tokens'].apply(lambda x: len(x) > 0)]
df_test = df_test[df_test['tokens'].apply(lambda x: len(x) > 0)]

Utilizando tokenizer BERTimbau base portuguese cased. https://huggingface.co/neuralmind/bert-base-portuguese-cased

In [11]:
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

Comparando tokenização com separação do dataset

In [12]:
example = dataset['train'][1]
tokenized_input = tokenizer(example['tokens'], is_split_into_words = True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
word_ids = tokenized_input.word_ids()

In [13]:
len(example['tokens']), len(tokens)

(55, 67)

In [14]:
np.array(example['tokens'])

array(['-', 'O', 'art', '.', '178', ',', 'II', ',', 'do', 'CPC',
       'prescreve', 'que', 'compete', 'ao', 'Ministério', 'Público',
       'intervir', 'nas', 'causas', 'em', 'que', 'há', 'interesses', 'de',
       'incapazes', ',', 'dispondo', 'o', 'art', '.', '279', 'do',
       'mesmo', 'diploma', 'que', 'o', 'processo', 'será', 'nulo',
       'quando', 'o', 'Ministério', 'Público', 'não', 'for', 'intimado',
       'para', 'acompanhar', 'o', 'feito', 'em', 'que', 'deve',
       'intervir', '.'], dtype='<U10')

In [15]:
np.array(tokens)

array(['[CLS]', '-', 'O', 'art', '.', '178', ',', 'II', ',', 'do', 'CP',
       '##C', 'pres', '##cre', '##ve', 'que', 'compet', '##e', 'ao',
       'Ministério', 'Público', 'inter', '##vir', 'nas', 'causas', 'em',
       'que', 'há', 'interesses', 'de', 'incapazes', ',', 'dispon',
       '##do', 'o', 'art', '.', '27', '##9', 'do', 'mesmo', 'diploma',
       'que', 'o', 'processo', 'será', 'nu', '##lo', 'quando', 'o',
       'Ministério', 'Público', 'não', 'for', 'intim', '##ado', 'para',
       'acompanhar', 'o', 'feito', 'em', 'que', 'deve', 'inter', '##vir',
       '.', '[SEP]'], dtype='<U10')

O tokenizer cria caracteres especiais '[CLS]' e '[SEP'], além de cortar palavras. A função abaixo contorna o problema ao atribuir - 100 aos caracteres especiais e tokens no meio das palavras.

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [17]:
q = tokenize_and_align_labels(dataset['train'][1:2])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
for token, label in zip(tokenizer.convert_ids_to_tokens(q['input_ids'][0]), q['labels'][0]):
  print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
-_______________________________________ 0
O_______________________________________ 0
art_____________________________________ 9
._______________________________________ 10
178_____________________________________ 10
,_______________________________________ 10
II______________________________________ 10
,_______________________________________ 10
do______________________________________ 10
CP______________________________________ 10
##C_____________________________________ -100
pres____________________________________ 0
##cre___________________________________ -100
##ve____________________________________ -100
que_____________________________________ 0
compet__________________________________ 0
##e_____________________________________ -100
ao______________________________________ 0
Ministério______________________________ 1
Público_________________________________ 2
inter___________________________________ 0
##vir___________________________

Criando dataset com função acima aplicada.

In [19]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Loading cached processed dataset at C:\Users\luanc\.cache\huggingface\datasets\lener_br\lener_br\1.0.0\4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382\cache-b5e3af25a796df75.arrow
Loading cached processed dataset at C:\Users\luanc\.cache\huggingface\datasets\lener_br\lener_br\1.0.0\4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382\cache-836f212fc79f7d78.arrow
Loading cached processed dataset at C:\Users\luanc\.cache\huggingface\datasets\lener_br\lener_br\1.0.0\4a8c97e6813b5c2d85a50faf0a3e6c24ea82f4a9044e6e9e8b24997d27399382\cache-500a7b4b2177cbe5.arrow


In [22]:
label_list = dataset['train'].features['ner_tags'].feature.names
label_list

['O',
 'B-ORGANIZACAO',
 'I-ORGANIZACAO',
 'B-PESSOA',
 'I-PESSOA',
 'B-TEMPO',
 'I-TEMPO',
 'B-LOCAL',
 'I-LOCAL',
 'B-LEGISLACAO',
 'I-LEGISLACAO',
 'B-JURISPRUDENCIA',
 'I-JURISPRUDENCIA']

In [23]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [24]:
seqeval = evaluate.load("seqeval")

In [25]:
import numpy as np

labels = [label_list]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [26]:
id2label = {
    0: "sem label",
    1: "B-ORGANIZACAO",
    2: "I-ORGANIZACAO",
    3: "B-PESSOA",
    4: "I-PESSOA",
    5: "B-TEMPO",
    6: "I-TEMPO",
    7: "B-LOCAL",
    8: "I-LOCAL",
    9: "B-LEGISLACAO",
    10: "I-LEGISLACAO",
    11: "B-JURISPRUDENCIA",
    12: "I-JURISPRUDENCIA"
}

In [27]:
label2id = {
    "sem label": 0,
    "B-ORGANIZACAO": 1,
    "I-ORGANIZACAO": 2,
    "B-PESSOA": 3,
    "I-PESSOA": 4,
    "B-TEMPO": 5,
    "I-TEMPO": 6,
    "B-LOCAL": 7,
    "I-LOCAL": 8,
    "B-LEGISLACAO": 9,
    "I-LEGISLACAO": 10,
    "B-JURISPRUDENCIA": 11,
    "I-JURISPRUDENCIA": 12
}

Finetune dos dados utilizando LeNER-Br tokenizado

In [28]:

model = AutoModelForTokenClassification.from_pretrained(
    'neuralmind/bert-base-portuguese-cased', num_labels=13, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir = '\\bert-fine-tuned',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub = False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Salvar dados

In [None]:
model.save_pretrained("bertimbau-base-lener-br")
tokenizer.save_pretrained("bertimbau-base-lener-br")