<a href="https://colab.research.google.com/github/pjpjean/rgeo/blob/main/Finetuning_NER_with_BERT_(Address_Parsing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação

In [1]:
import pandas as pd
import numpy as np
from contextlib import nullcontext
import tqdm
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import SGD, Adam

In [2]:
%%capture
!pip install transformers datasets evaluate seqeval

In [93]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
import evaluate

# Carga dos dados

In [4]:
dados = load_dataset('csv', data_files='/content/drive/MyDrive/Colab Notebooks/data/br_100mil.csv',
                     column_names=['address', 'tags'],
                     split='train')



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-28210ea4bfa6d9b7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-28210ea4bfa6d9b7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


In [5]:
dados[:4], dados[-5:]

({'address': ['r augusto bailao itaberaí go 76630-000',
   'rua capitao daindo 435 redentora rs 98550-000',
   'rua luiz montelatto 85 louveira sp 13290-000',
   'r apiacas 53 belford roxo rj 26115-280'],
  'tags': ['LOG LOG LOG MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG NUM MUN MUN UF CEP']},
 {'address': ['rua ernesto duarte de almeida 21 são fidélis rj 28400-000',
   'rua dinamarca altos pi 64290-000',
   'rua sao martinho 1265 pradópolis sp 14850-000',
   'avenida coronel joao da mota ribeiro 331 bom jesus do amparo mg cep 35908-000',
   'estrada mgg 010 mogi guaçu sp cep 13840-000'],
  'tags': ['LOG LOG LOG LOG LOG NUM MUN MUN UF CEP',
   'LOG LOG MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG LOG LOG LOG LOG NUM MUN MUN MUN MUN UF CEP CEP',
   'LOG LOG LOG MUN MUN UF CEP CEP']})

In [6]:
caracteres = {uc for t in dados['address'] for uc in t}
''.join(sorted(caracteres))

' "#\'()-.0123456789:=[]abcdefghijklmnopqrstuvwxyz|ªºáâãçéêíóôõú�'

In [7]:
outside_label = 'O'
labelset = {ul for l in dados['tags'] for ul in l.split()}
labelset = ([outside_label] + [l for l in labelset if l != outside_label])
label2id = {l: i for i, l in enumerate(labelset)}
id2label = {i: l for i, l in enumerate(labelset)}
labelset

['O', 'UF', 'LOG', 'CEP', 'MUN', 'NUM']

In [8]:
max_len = max([len(t) for t in dados['address']])
max_len

108

In [9]:
max_words = max([len(t.split()) for t in dados['address']])
max_words

19

# Preparação dos datasets

In [11]:
MAX_LEN = 32
IGNORED_LABEL_ID = -100
MODEL_CHECKPOINT = 'bert-base-uncased'
TAM_AMOSTRA = 1000

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
dset = dados.train_test_split(train_size=int(0.6 * TAM_AMOSTRA),
                              test_size=int(0.4 * TAM_AMOSTRA),
                              seed=1452320)
dset['test'], dset['validation'] = (dset['test']
                                    .train_test_split(test_size=0.5, seed=1452321)
                                    .values())

In [14]:
dset

DatasetDict({
    train: Dataset({
        features: ['address', 'tags'],
        num_rows: 600
    })
    test: Dataset({
        features: ['address', 'tags'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['address', 'tags'],
        num_rows: 200
    })
})

In [15]:
# Função auxiliar que, a partir do word_id, retorna o índice do label 
# respectivo na lista original de labels
def get_token_label_pos(word2token, word_id):
    if word_id is None:
        return None

    try:
        l_pos = word2token.index(word_id)
    except ValueError:
        l_pos = max([idx for idx, wid in enumerate(word2token) if wid <= word_id])

    return l_pos

# Rotina de tokenização dos dados
def tokenize_and_align_labels(data, label2id, outside_label):
    tokens = tokenizer(data['address'], truncation=True)

    new_labels = []
    outside_id = label2id.get(outside_label)
    for i, (text, tags) in enumerate(zip(data['address'], data['tags'])):
        labels = tags.split()

        # Posição das palavras conforme separação original do deepparse
        dp_word_pos = [0] + list(np.cumsum([len(t) + 1 for t in text.split()[:-1]]))

        # Identificação de qual token inicia cada palavra original
        dp_word_to_token = [tokens.char_to_word(i, wp) for wp in dp_word_pos]

        # Monta a lista de (ids de) rótulos realinhada com a tokenização do Bert
        token_ids = tokens.word_ids(i)
        label_pos = [get_token_label_pos(dp_word_to_token, wid) for wid in token_ids]
        new_labels.append([label2id.get(labels[pos], outside_id)
            if pos is not None else IGNORED_LABEL_ID
            for pos in label_pos])

    tokens["labels"] = new_labels
    return tokens

In [16]:
t_dset = dset.map(
    lambda x: tokenize_and_align_labels(x, label2id, outside_label),
    batched=True,
    remove_columns=dset["train"].column_names,    
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
dset['train'][120]

{'address': 'rua prof zuza 735 a natal rn 59025-160',
 'tags': 'LOG LOG LOG NUM NUM MUN UF CEP'}

In [18]:
t_dset['train'][120]

{'input_ids': [101,
  21766,
  2050,
  11268,
  16950,
  4143,
  6421,
  2629,
  1037,
  17489,
  29300,
  25186,
  17788,
  1011,
  8148,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 2, 2, 2, 2, 2, 5, 5, 5, 4, 1, 3, 3, 3, 3, -100]}

In [39]:
# Ajusta a sequência de labels para o padrão IOB
def iob_labels(labels, outside_label='O'):
    add_prefix = lambda l, a: l if l == outside_label else ('B-' if l != a else 'I-') + l
    iob = [add_prefix(l, lant) for l, lant in zip(labels, ['###'] + labels[:-1])]
    return iob

In [50]:
# Representação textual dos dados tokenizados
def get_text_repr(data, iob=False):

    is_nested = any(isinstance(item, list) for item in data['labels'])
    d_input_ids = data['input_ids'] if is_nested else [data['input_ids']]
    d_labels = data['labels'] if is_nested else [data['labels']]

    text = []
    tags = []
    for input_ids, labels in zip(d_input_ids, d_labels):
        valid_ids = [id for id in input_ids if id != 0]
        text.append(' '.join(tokenizer.convert_ids_to_tokens(valid_ids)))
        if iob:
            tags.append(' '.join(iob_labels(
                [id2label.get(i, outside_label) for i in labels[:len(valid_ids)]]
            )))
        else:
            tags.append(' '.join(
                id2label.get(i, outside_label) for i in labels[:len(valid_ids)]
            ))

    return text, tags

In [51]:
get_text_repr(t_dset['train'][120])

(['[CLS] ru ##a prof zu ##za 73 ##5 a natal rn 590 ##25 - 160 [SEP]'],
 ['O LOG LOG LOG LOG LOG NUM NUM NUM MUN UF CEP CEP CEP CEP O'])

In [52]:
get_text_repr(t_dset['train'][120], iob=True)

(['[CLS] ru ##a prof zu ##za 73 ##5 a natal rn 590 ##25 - 160 [SEP]'],
 ['O B-LOG I-LOG I-LOG I-LOG I-LOG B-NUM I-NUM I-NUM B-MUN B-UF B-CEP I-CEP I-CEP I-CEP O'])

# Especificação dos lotes e das metricas

In [53]:
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load('seqeval')

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [86]:
# The function that will be used to compute metrics at evaluation.
# Must take a EvalPrediction and return a dictionary string to metric values.
def compute_metrics(eval_pred):
    # Remove ignored index (special tokens) and convert to labels
    true_label_ids = eval_pred.label_ids
    true_labels = [
        iob_labels(
            [id2label.get(l_id, outside_label)
                for l_id in label_ids
                if l_id != IGNORED_LABEL_ID]
        ) for label_ids in true_label_ids
    ]

    # Align prediction to true_labels and convert to labels
    pred_label_ids = np.argmax(eval_pred.predictions, axis=-1)
    pred_labels = [
        iob_labels(
            [id2label.get(p_id, outside_label)
                for (p_id, l_id) in zip(pred_ids, label_ids)
                if l_id != IGNORED_LABEL_ID]
        ) for pred_ids, label_ids in zip(pred_label_ids, true_label_ids)
    ]
    all_metrics = metric.compute(predictions=pred_labels, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Inicialização do modelo BERT

In [None]:
%%capture
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id
)

model.to(device)

In [88]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "UF",
    "2": "LOG",
    "3": "CEP",
    "4": "MUN",
    "5": "NUM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "CEP": 3,
    "LOG": 2,
    "MUN": 4,
    "NUM": 5,
    "O": 0,
    "UF": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Como a função perda do modelo é a *entropia cruzada*, o modelo ainda não treinado deve ter uma perda próxima da entropia máxima, que é de $-\ln(1/n)$, onde $n$ é o número de rótulos. No caso, $-ln(1/6)=1.79$.

In [89]:
tokens = collator([t_dset['train'][i] for i in range(10)])
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']
label_ids = tokens['labels']
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_ids)
outputs['loss']

tensor(1.8858, grad_fn=<NllLossBackward0>)

In [90]:
args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [91]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=t_dset["train"],
    eval_dataset=t_dset["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [92]:
trainer.train()

***** Running training *****
  Num examples = 600
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 225
  Number of trainable parameters = 108896262


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.105773,0.90766,0.915344,0.911486,0.962805
2,No log,0.04527,0.934375,0.949206,0.941732,0.980125
3,No log,0.041824,0.941667,0.956614,0.949081,0.982112


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-75
Configuration saved in bert-finetuned-ner/checkpoint-75/config.json
Model weights saved in bert-finetuned-ner/checkpoint-75/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-75/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-75/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetuned-ner/checkpoint-150
Configuration saved in bert-finetuned-ner/checkpoint-150/config.json
Model weights saved in bert-finetuned-ner/checkpoint-150/pytorch_model.bin
tokenizer config file saved in bert-finetuned-ner/checkpoint-150/tokenizer_config.json
Special tokens file saved in bert-finetuned-ner/checkpoint-150/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to bert-finetune

TrainOutput(global_step=225, training_loss=0.19037931654188367, metrics={'train_runtime': 647.4871, 'train_samples_per_second': 2.78, 'train_steps_per_second': 0.347, 'total_flos': 22129369338240.0, 'train_loss': 0.19037931654188367, 'epoch': 3.0})

In [94]:
token_classifier = pipeline(
    "token-classification", model='bert-finetuned-ner/checkpoint-225', aggregation_strategy="simple"
)

loading configuration file bert-finetuned-ner/checkpoint-225/config.json
Model config BertConfig {
  "_name_or_path": "bert-finetuned-ner/checkpoint-225",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "UF",
    "2": "LOG",
    "3": "CEP",
    "4": "MUN",
    "5": "NUM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "CEP": 3,
    "LOG": 2,
    "MUN": 4,
    "NUM": 5,
    "O": 0,
    "UF": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522


In [95]:
token_classifier('rua mons. bruno 1005 ap 500 fortaleza ce 60050-020')

[{'entity_group': 'LOG',
  'score': 0.998127,
  'word': 'rua mons. bruno',
  'start': 0,
  'end': 15},
 {'entity_group': 'NUM',
  'score': 0.9644756,
  'word': '1005 ap 500',
  'start': 16,
  'end': 27},
 {'entity_group': 'MUN',
  'score': 0.9959838,
  'word': 'fortaleza',
  'start': 28,
  'end': 37},
 {'entity_group': 'UF',
  'score': 0.9947107,
  'word': 'ce',
  'start': 38,
  'end': 40},
 {'entity_group': 'CEP',
  'score': 0.99853754,
  'word': '60050 - 020',
  'start': 41,
  'end': 50}]