In [1]:
from datasets import load_dataset
import datasets
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification, TokenClassificationPipeline,TrainingArguments,EarlyStoppingCallback, DataCollatorForTokenClassification, Trainer
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
import spacy
from sklearn_crfsuite import CRF, metrics
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [6]:
# Load the spaCy pipeline named "pt_core_news_sm" and keep it in `nlp`.
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load("pt_core_news_sm")

In [5]:
# Load the LeNER-Br corpus (train / validation / test splits) from the Hub.
# The default download mode reuses the local cache, so re-running the notebook
# does not fetch the files again. Pass download_mode="force_redownload" only
# as a one-off to recover from a corrupted cache.
dataset = load_dataset("lener_br")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.11k [00:00<?, ?B/s]

lener_br.py:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

lener_br.py:   0%|          | 0.00/5.84k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.11k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/94.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7828 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1177 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1390 [00:00<?, ? examples/s]

In [4]:
# Show the free-text description stored with the train split.
print(dataset['train'].description)


LeNER-Br is a Portuguese language dataset for named entity recognition
applied to legal documents. LeNER-Br consists entirely of manually annotated
legislation and legal cases texts and contains tags for persons, locations,
time entities, organizations, legislation and legal cases.
To compose the dataset, 66 legal documents from several Brazilian Courts were
collected. Courts of superior and state levels were considered, such as Supremo
Tribunal Federal, Superior Tribunal de Justiça, Tribunal de Justiça de Minas
Gerais and Tribunal de Contas da União. In addition, four legislation documents
were collected, such as "Lei Maria da Penha", giving a total of 70 documents



In [5]:
# Show the BibTeX citation stored with the train split.
print(dataset['train'].citation)


@inproceedings{luz_etal_propor2018,
    author = {Pedro H. {Luz de Araujo} and Te'{o}filo E. {de Campos} and
    Renato R. R. {de Oliveira} and Matheus Stauffer and
    Samuel Couto and Paulo Bermejo},
    title = {{LeNER-Br}: a Dataset for Named Entity Recognition in {Brazilian} Legal Text},
    booktitle = {International Conference on the Computational Processing of Portuguese ({PROPOR})},
    publisher = {Springer},
    series = {Lecture Notes on Computer Science ({LNCS})},
    pages = {313--323},
    year = {2018},
    month = {September 24-26},
    address = {Canela, RS, Brazil},
    doi = {10.1007/978-3-319-99722-3_32},
    url = {https://cic.unb.br/~teodecampos/LeNER-Br/},
}



In [6]:
# Show the homepage URL stored with the train split.
print(dataset['train'].homepage)

https://cic.unb.br/~teodecampos/LeNER-Br/


In [7]:
# Tag names of the `ner_tags` feature; an integer tag id is an index into
# this list (it is used that way when building the CRF training data).
label_list = dataset["train"].features["ner_tags"].feature.names
print(label_list)

['O', 'B-ORGANIZACAO', 'I-ORGANIZACAO', 'B-PESSOA', 'I-PESSOA', 'B-TEMPO', 'I-TEMPO', 'B-LOCAL', 'I-LOCAL', 'B-LEGISLACAO', 'I-LEGISLACAO', 'B-JURISPRUDENCIA', 'I-JURISPRUDENCIA']


## CRF

In [7]:
# Fixed-seed subsamples of the raw splits used by the CRF experiments:
# 1000 training sentences and 200 test sentences.
small_train, small_test = (
    dataset[split_name].shuffle(seed=42).select(range(n_examples))
    for split_name, n_examples in (("train", 1000), ("test", 200))
)

In [29]:
def convert_to_crf_format(dataset_split):
    """Turn a dataset split into per-sentence lists of (token, tag name) pairs.

    Args:
        dataset_split: iterable of examples, each a mapping with a "tokens"
            sequence and a parallel "ner_tags" sequence of integer tag ids.

    Returns:
        A list with one entry per example; each entry is a list of
        ``(token, tag_name)`` tuples, where the tag name is looked up in the
        module-level ``label_list``.
    """
    def pair_tokens_with_tags(example):
        words = example["tokens"]
        tag_names = [label_list[tag_id] for tag_id in example["ner_tags"]]
        return list(zip(words, tag_names))

    return [pair_tokens_with_tags(example) for example in dataset_split]

# (token, tag name) sentences for the CRF, built from the two subsamples.
train_data, test_data = map(convert_to_crf_format, (small_train, small_test))

In [30]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: sequence of items whose element 0 is the token text
            (e.g. ``(token, label)`` pairs).
        i: index of the token to describe.

    Returns:
        A dict with a bias term, the lower-cased token, its last three and
        last two characters, and casing/digit flags. Lower-cased form and
        casing flags of the previous and next tokens are added when they
        exist; otherwise ``'BOS'`` / ``'EOS'`` is set to ``True``.
    """
    def neighbour_features(offset, token):
        # Identity and casing cues of an adjacent token, keyed by its offset.
        return {
            f'{offset}:word.lower()': token.lower(),
            f'{offset}:word.istitle()': token.istitle(),
            f'{offset}:word.isupper()': token.isupper(),
        }

    current = sent[i][0]
    feats = {
        'bias': 1.0,
        'word.lower()': current.lower(),
        'word[-3:]': current[-3:],
        'word[-2:]': current[-2:],
        'word.isupper()': current.isupper(),
        'word.istitle()': current.istitle(),
        'word.isdigit()': current.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        feats.update(neighbour_features('-1', sent[i - 1][0]))

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        feats.update(neighbour_features('+1', sent[i + 1][0]))

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    return [word2features(sent, position) for position in range(len(sent))]
def sent2labels(sent):
    """Return the label (second element) of every ``(token, label)`` pair in ``sent``."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

In [31]:
# Per-sentence feature dicts (X) and gold tag sequences (y) for both CRF splits.
X_train = list(map(sent2features, train_data))
y_train = list(map(sent2labels, train_data))

X_test = list(map(sent2features, test_data))
y_test = list(map(sent2labels, test_data))

In [32]:
# First CRF run: L-BFGS training capped at 200 iterations.
# `fit` returns the estimator itself, so construction and training are chained.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
).fit(X_train, y_train)

# Token-level (flattened) report on the held-out CRF test sentences.
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

                  precision    recall  f1-score   support

B-JURISPRUDENCIA      0.833     0.556     0.667        27
    B-LEGISLACAO      0.840     0.792     0.816        53
         B-LOCAL      0.455     0.500     0.476        10
   B-ORGANIZACAO      0.878     0.489     0.628        88
        B-PESSOA      0.850     0.567     0.680        30
         B-TEMPO      1.000     0.261     0.414        23
I-JURISPRUDENCIA      0.615     0.914     0.736        35
    I-LEGISLACAO      0.854     0.920     0.886       337
         I-LOCAL      0.375     0.353     0.364        17
   I-ORGANIZACAO      0.859     0.433     0.575       141
        I-PESSOA      0.983     0.781     0.870        73
         I-TEMPO      1.000     1.000     1.000        12
               O      0.971     0.991     0.981      6010

        accuracy                          0.957      6856
       macro avg      0.809     0.658     0.699      6856
    weighted avg      0.956     0.957     0.953      6856



In [33]:
# Second CRF run: identical to `crf` except that training is capped at
# 50 iterations. Note that `y_pred` is reassigned by this cell.
crf2 = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=50,
    all_possible_transitions=True,
).fit(X_train, y_train)

# Token-level (flattened) report on the same test sentences, for comparison.
y_pred = crf2.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, digits=3))

                  precision    recall  f1-score   support

B-JURISPRUDENCIA      0.800     0.444     0.571        27
    B-LEGISLACAO      0.857     0.792     0.824        53
         B-LOCAL      0.455     0.500     0.476        10
   B-ORGANIZACAO      0.875     0.557     0.681        88
        B-PESSOA      0.900     0.600     0.720        30
         B-TEMPO      1.000     0.261     0.414        23
I-JURISPRUDENCIA      0.644     0.829     0.725        35
    I-LEGISLACAO      0.941     0.855     0.896       337
         I-LOCAL      0.333     0.294     0.312        17
   I-ORGANIZACAO      0.852     0.489     0.622       141
        I-PESSOA      1.000     0.781     0.877        73
         I-TEMPO      1.000     1.000     1.000        12
               O      0.966     0.994     0.980      6010

        accuracy                          0.958      6856
       macro avg      0.817     0.646     0.700      6856
    weighted avg      0.956     0.958     0.954      6856



## BERTimbau

In [4]:
# Hub checkpoint id; used here for the tokenizer and again later when the
# token-classification model is loaded.
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Tag names of the `ner_tags` feature (index = integer tag id).
# NOTE(review): this repeats the assignment made in the dataset-exploration
# cells; it appears to exist so this section can run on its own — confirm.
label_list = dataset["train"].features["ner_tags"].feature.names
print(label_list)

['O', 'B-ORGANIZACAO', 'I-ORGANIZACAO', 'B-PESSOA', 'I-PESSOA', 'B-TEMPO', 'I-TEMPO', 'B-LOCAL', 'I-LOCAL', 'B-LEGISLACAO', 'I-LEGISLACAO', 'B-JURISPRUDENCIA', 'I-JURISPRUDENCIA']


In [6]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tag ids per sentence).

    Returns:
        The tokenizer output (truncated to 128 tokens, unpadded) with an added
        "labels" entry. For every sentence, the first sub-token of each word
        carries that word's tag id; positions with no word id and the
        remaining sub-tokens of a word are set to -100.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=False,
        max_length=128,
        is_split_into_words=True,
    )

    aligned_labels = []
    for row in range(len(examples["tokens"])):
        row_labels = []
        last_word = None
        for word in encoding.word_ids(batch_index=row):
            # Only the first sub-token of a real word receives the word's tag.
            starts_new_word = word is not None and word != last_word
            row_labels.append(examples["ner_tags"][row][word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row_labels)

    encoding["labels"] = aligned_labels
    return encoding

In [7]:
# Run tokenize_and_align_labels over the dataset in batches of 50 examples.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, batch_size=50)

Map:   0%|          | 0/7828 [00:00<?, ? examples/s]

Map:   0%|          | 0/1177 [00:00<?, ? examples/s]

Map:   0%|          | 0/1390 [00:00<?, ? examples/s]

In [8]:
# Collator built from the tokenizer and later handed to the Trainer.
# The examples were tokenized with padding=False, so batching is left to it.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [9]:
# Label vocabulary plus the id <-> name lookups passed to the model config.
label_list = dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
id_to_label = dict(enumerate(label_list))
label_to_id = {name: idx for idx, name in id_to_label.items()}

In [10]:
# Fixed-seed subsamples of the tokenized splits used for fine-tuning:
# 1000 training and 200 validation sentences.
# Note: `small_train` is rebound here; in the CRF section the same name held
# the untokenized subsample.
small_train, small_val = (
    tokenized_dataset[split_name].shuffle(seed=42).select(range(n_examples))
    for split_name, n_examples in (("train", 1000), ("validation", 200))
)

In [12]:
# Pretrained encoder with a token-classification head sized to the label set.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)

# Evaluate and checkpoint once per epoch so that the checkpoint with the
# lowest validation loss can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=4,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_steps=50,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    # `tokenizer=` is deprecated in Trainer.__init__ (it emitted a warning
    # when this cell ran); `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Stop early once eval_loss fails to improve for 2 evaluations in a row.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

# With load_best_model_at_end=True this scores the restored best checkpoint
# on the validation subsample.
results = trainer.evaluate()
print("Resultados de avaliação:", results)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4742,0.193278
2,0.0816,0.139762


Epoch,Training Loss,Validation Loss
1,0.4742,0.193278
2,0.0816,0.139762
3,0.0318,0.122048
4,0.017,0.131255


Resultados de avaliação: {'eval_loss': 0.12204776704311371, 'eval_runtime': 77.5694, 'eval_samples_per_second': 2.578, 'eval_steps_per_second': 0.129, 'epoch': 4.0}


In [25]:
# Predict on a fixed-seed sample of 100 tokenized test sentences.
predictions_output = trainer.predict(
    tokenized_dataset["test"].shuffle(seed=42).select(range(100))
)

labels = predictions_output.label_ids
# Highest-scoring class id per token position.
preds = np.argmax(predictions_output.predictions, axis=-1)

# Convert ids to tag names, dropping every position whose gold label is -100
# (the value assigned to ignored positions during label alignment).
true_labels = [
    [id_to_label[int(gold)] for _guess, gold in zip(guess_row, gold_row) if gold != -100]
    for guess_row, gold_row in zip(preds, labels)
]
true_predictions = [
    [id_to_label[int(guess)] for guess, gold in zip(guess_row, gold_row) if gold != -100]
    for guess_row, gold_row in zip(preds, labels)
]

In [26]:
# `classification_report` here is the one imported from seqeval.metrics; it
# receives one list of tag names per sentence for gold and predicted labels.
print("\nRelatório completo:")
print(classification_report(true_labels, true_predictions))


Relatório completo:
                precision    recall  f1-score   support

JURISPRUDENCIA       0.86      1.00      0.92        12
    LEGISLACAO       1.00      0.86      0.92        14
         LOCAL       0.75      1.00      0.86         3
   ORGANIZACAO       0.57      0.72      0.63        36
        PESSOA       0.63      0.86      0.73        14
         TEMPO       1.00      0.56      0.71         9

     micro avg       0.70      0.80      0.74        88
     macro avg       0.80      0.83      0.80        88
  weighted avg       0.74      0.80      0.75        88

