<a href="https://colab.research.google.com/github/pjpjean/rgeo/blob/main/Finetuning_NER_with_BERT_(Address_Parsing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação

In [2]:
import pandas as pd
import numpy as np
from contextlib import nullcontext
import tqdm
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import SGD, Adam

In [None]:
%%capture
# Install the Hugging Face stack used below; %%capture hides pip's log.
# NOTE(review): versions are unpinned — the recorded model.config output in
# this notebook reports transformers 4.26.0; consider pinning for reproducibility.
!pip install transformers evaluate seqeval

In [14]:
from transformers import BertTokenizerFast, BertForTokenClassification
import evaluate

# Carga dos dados

In [15]:
# Load the address corpus: one address per row with its space-separated labels.
dados = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/data/br_100mil.csv',
    names=['text', 'labels'],
)
dados

Unnamed: 0,text,labels
0,r augusto bailao itaberaí go 76630-000,LOG LOG LOG MUN UF CEP
1,rua capitao daindo 435 redentora rs 98550-000,LOG LOG LOG NUM MUN UF CEP
2,rua luiz montelatto 85 louveira sp 13290-000,LOG LOG LOG NUM MUN UF CEP
3,r apiacas 53 belford roxo rj 26115-280,LOG LOG NUM MUN MUN UF CEP
4,estrada do curral novo 63 nova iguaçu rio de j...,LOG LOG LOG LOG NUM MUN MUN UF UF UF CEP
...,...,...
99605,rua ernesto duarte de almeida 21 são fidélis r...,LOG LOG LOG LOG LOG NUM MUN MUN UF CEP
99606,rua dinamarca altos pi 64290-000,LOG LOG MUN UF CEP
99607,rua sao martinho 1265 pradópolis sp 14850-000,LOG LOG LOG NUM MUN UF CEP
99608,avenida coronel joao da mota ribeiro 331 bom j...,LOG LOG LOG LOG LOG LOG NUM MUN MUN MUN MUN UF...


In [16]:
# Inventory of every distinct character that appears in the address texts.
caracteres = set(''.join(dados['text']))
''.join(sorted(caracteres))

' "#\'()-.0123456789:=[]abcdefghijklmnopqrstuvwxyz|ªºáâãçéêíóôõú�'

In [17]:
# Distinct label names used across the whole corpus.
labels = set(' '.join(dados['labels']).split())
labels

{'CEP', 'LOG', 'MUN', 'NUM', 'UF'}

In [18]:
# Longest address, measured in characters.
max_len = max(map(len, dados['text']))
max_len

108

In [19]:
# Longest address, measured in whitespace-separated words.
max_words = max(map(len, map(str.split, dados['text'])))
max_words

19

In [20]:
# Addresses containing a hyphen that sits between two non-digit characters.
dados.loc[dados['text'].str.contains(r'\D-\D'), 'text']

89                  ave beira mar barra-velha sc 88390-000
402      rua sao paulo 1670 são-caetano-do-s sp cep 095...
479      rua juiz niltho leite 60 volta-redonda rj cep ...
591      estrada santa barbara ao alto da serra abelard...
685              rodovia do cafe nova-venécia es 29830-000
                               ...                        
99269            rua vespasiano 445 são-paulo sp 05044-050
99384    rua joao antonio rodrigues 13 embu-guaçu sp 06...
99513         r pernambuco 625 arroio-do-meio rs 95940-000
99560        r padre vitorio igarapé-miri pa cep 68430-000
99593        estrada da laguna capão-da-canoa rs 95555-000
Name: text, Length: 678, dtype: object

# Preparação dos datasets

In [21]:
# Fixed token-sequence length: passed as `max_length` to the tokenizer in
# DeepparseDataset.__getitem__, which pads/truncates every example to it.
MAX_LEN = 32

In [22]:
class DeepparseDataset(Dataset):
    """Deepparse CSV dataset yielding BERT-tokenized addresses with aligned labels.

    Each row of ``dataframe`` holds an address in the ``text`` column and, in
    the ``labels`` column, one whitespace-separated label per
    whitespace-separated word of the address.

    Indexing returns ``(tokens, label_ids)``: ``tokens`` is the tokenizer
    output (tensors keep the leading dimension of size 1 produced by
    ``return_tensors="pt"``) and ``label_ids`` is a 1-D ``LongTensor`` with one
    label id per token position.

    Args:
        dataframe: pandas DataFrame with ``text`` and ``labels`` columns.
        labelset: optional iterable of label names; when empty/None the labels
            are collected from ``dataframe['labels']`` and sorted.
        outside_label: name of the "outside" label; it is always given id 0 and
            is assigned to special tokens and padding.
        max_len: token-sequence length used for padding/truncation. ``None``
            (default) keeps the original behaviour of reading the module-level
            ``MAX_LEN`` at item-access time.
        pretrained_name: checkpoint name handed to
            ``BertTokenizerFast.from_pretrained``.
    """

    def __init__(self, dataframe, labelset=None, outside_label='O',
                 max_len=None, pretrained_name='bert-base-uncased'):
        self.dataframe = dataframe
        self.outside_label = outside_label
        self.max_len = max_len

        if labelset:
            base_labels = list(labelset)
        else:
            base_labels = sorted({ul for l in dataframe['labels'] for ul in l.split()})

        # Guarantee that the 'O' (outside) label always comes first (id 0).
        self.labelset = ([self.outside_label] +
                         [l for l in base_labels if l != self.outside_label])

        self._label_ids = {l: i for i, l in enumerate(self.labelset)}
        self._id_labels = {i: l for i, l in enumerate(self.labelset)}
        self._tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

    @staticmethod
    def _first_word_id(tokens, start, end):
        """Tokenizer word index of the first mapped character in text[start:end].

        ``char_to_word`` yields None for characters that end up in no token
        (e.g. removed by normalization or cut off by truncation), so the span
        is scanned until a mapped character is found. Returns None if the
        whole span is unmapped.
        """
        for pos in range(start, end):
            word_id = tokens.char_to_word(pos)
            if word_id is not None:
                return word_id
        return None

    @staticmethod
    def _label_position(word_first_ids, word_id):
        """Index of the original (space-separated) word that owns ``word_id``.

        ``word_first_ids[i]`` is the tokenizer word index at which original
        word ``i`` starts (or None when it could not be located). A tokenizer
        word that does not start an original word belongs to the last original
        word starting at or before it. Returns None for special/padding tokens
        (``word_id is None``) and when no owner can be determined.
        """
        if word_id is None:
            return None

        try:
            return word_first_ids.index(word_id)
        except ValueError:
            pass

        # Skip unmapped (None) entries: comparing None with an int raises.
        candidates = [pos for pos, first in enumerate(word_first_ids)
                      if first is not None and first <= word_id]
        return max(candidates) if candidates else None

    def __getitem__(self, idx):
        """Return ``(tokens, label_ids)`` for the row at position ``idx``."""
        row = self.dataframe.iloc[idx]
        text = row['text']

        max_length = MAX_LEN if self.max_len is None else self.max_len
        tokens = self._tokenizer(text, padding='max_length', max_length=max_length,
                                 truncation=True, return_tensors="pt")

        # The original deepparse labels are per space-separated word. BERT
        # tokenizers split terms further (punctuation, sub-words), usually
        # producing more tokens than labelled words, so the labels have to be
        # realigned with the new segmentation.
        labels = row['labels'].split()

        # Locate every original word in the text (robust to repeated
        # whitespace) and record the tokenizer word index where it starts.
        word_first_ids = []
        cursor = 0
        for word in text.split():
            start = text.index(word, cursor)
            cursor = start + len(word)
            word_first_ids.append(self._first_word_id(tokens, start, cursor))

        # Build the list of label ids realigned with the BERT tokenization;
        # anything that cannot be mapped to a label gets the outside id.
        outside_id = self._label_ids.get(self.outside_label)
        aligned = []
        for word_id in tokens.word_ids():
            label_pos = self._label_position(word_first_ids, word_id)
            if label_pos is None or label_pos >= len(labels):
                aligned.append(outside_id)
            else:
                aligned.append(self._label_ids.get(labels[label_pos], outside_id))

        return tokens, torch.LongTensor(aligned)

    def get_text_rep(self, item, keep_padding=False):
        """
        Build a textual representation of a tuple returned by ``__getitem__``
        to make inspection easier.

        Returns ``(txt_tokens, txt_labels)``: the token strings and the label
        names, each joined by single spaces. Padding positions are dropped
        unless ``keep_padding`` is true. Label ids with no known name are
        rendered as ``'[-]'``.
        """
        tokens, label_ids = item

        # Take only the first (and only) row of the input-id tensor.
        input_ids = tokens['input_ids'].tolist()[0]
        if not keep_padding:
            pad_id = getattr(self._tokenizer, 'pad_token_id', None)
            if pad_id is None:
                pad_id = 0
            input_ids = [token_id for token_id in input_ids if token_id != pad_id]
        label_ids = label_ids.tolist()[:len(input_ids)]

        txt_tokens = ' '.join(self._tokenizer.convert_ids_to_tokens(input_ids))
        txt_labels = ' '.join([self._id_labels.get(i, '[-]') for i in label_ids])

        return txt_tokens, txt_labels

In [23]:
# Work on a reproducible 1000-row sample, split 60/20/20 into
# train / validation / test subsets with a seeded generator.
dset = DeepparseDataset(dataframe=dados.sample(n=1000, random_state=1452320))
dset_trn, dset_val, dset_tst = random_split(
    dset,
    lengths=[0.6, 0.2, 0.2],
    generator=torch.Generator().manual_seed(1452321),
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
# Sizes of the train / validation / test subsets.
tuple(len(subset) for subset in (dset_trn, dset_val, dset_tst))

(600, 200, 200)

In [25]:
# Label vocabulary built by the dataset; the outside label is placed first.
dset.labelset

['O', 'CEP', 'LOG', 'MUN', 'NUM', 'UF']

In [15]:
# Inspect one training item: the tokenizer output and the aligned label-id tensor.
dset_trn[120]

({'input_ids': tensor([[  101, 14115,  2140,  2079,  8096, 17166, 13955, 28621,  6643,  6273,
           2581, 10790,  1011,  2199,   102,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0]])},
 tensor([0, 2, 2, 2, 2, 2, 3, 3, 5, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]))

In [16]:
# Same item rendered as token strings and label names (padding removed).
dset.get_text_rep(dset_trn[120])

('[CLS] rama ##l do cruz ##eiro mara ##cana pa 68 ##7 ##10 - 000 [SEP]',
 'O LOG LOG LOG LOG LOG MUN MUN UF CEP CEP CEP CEP CEP O')

In [17]:
# Original dataframe row behind that item: the subset stores positions into
# the full dataset in `indices`.
dset.dataframe.iloc[dset_trn.indices[120], :]

text      ramal do cruzeiro maracanã pa 68710-000
labels                     LOG LOG LOG MUN UF CEP
Name: 41783, dtype: object

# Inicialização do modelo BERT

In [18]:
%%capture
# Use the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Load pretrained BERT with a token-classification head sized to the dataset's
# label set, passing the dataset's id<->label maps to the model.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(dset.labelset),
    id2label=dset._id_labels,
    label2id=dset._label_ids)

# Move the model's parameters to the selected device.
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [19]:
# Show the resulting configuration (includes the id2label / label2id maps).
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "CEP",
    "2": "LOG",
    "3": "MUN",
    "4": "NUM",
    "5": "UF"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "CEP": 1,
    "LOG": 2,
    "MUN": 3,
    "NUM": 4,
    "O": 0,
    "UF": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

Como a função de perda do modelo é a *entropia cruzada*, o modelo ainda não treinado deve ter uma perda próxima da entropia máxima, que é de $-\ln(1/n) = \ln n$, onde $n$ é o número de rótulos. No caso, $-\ln(1/6) \approx 1.79$.

In [20]:
# Sanity check: loss of the still-untrained model on a single training example.
tokens, label_ids = dset_trn[0]
input_ids = tokens['input_ids'].to(device)
attention_mask = tokens['attention_mask'].to(device)
# The labels must live on the same device as the inputs/model; otherwise the
# loss computation fails with a device mismatch when running on CUDA.
outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                labels=label_ids.to(device))
outputs['loss']

tensor(1.8641, grad_fn=<NllLossBackward0>)

In [102]:
# Training hyperparameters read by later cells:
# BATCH_SIZE -> DataLoader batch size, EPOCHS -> number of passes in
# train_loop, LEARNING_RATE -> optimizer learning rate.
BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
# NOTE(review): MAX_GRAD_NORM is not referenced by any visible cell —
# presumably intended for gradient clipping; confirm or remove.
MAX_GRAD_NORM = 10

# Alternative settings, currently disabled:
#LEARNING_RATE = 5e-3
#EPOCHS = 5
#BATCH_SIZE = 2

In [54]:
# Shape check: the tokenizer output has a leading dimension of size 1, which
# squeeze(0) removes.
tokens['input_ids'].shape, tokens['input_ids'].squeeze(0).shape

(torch.Size([1, 32]), torch.Size([32]))

In [41]:
# Flatten the logits to one row per token; 6 is the number of labels in
# dset.labelset for this notebook.
outputs['logits'].view(-1, 6)

tensor([[-0.4184, -0.2407,  0.4331,  0.0615, -0.1948,  0.1944],
        [-0.1995, -0.3228,  0.1269,  0.0716, -0.4840, -0.2923],
        [-0.0656, -0.3456, -0.0405,  0.0765,  0.1081, -0.4086],
        [ 0.2763,  0.1454,  0.3541,  0.2043, -0.0893,  0.1717],
        [ 0.5258, -0.0744,  0.3883,  0.0057, -0.0297, -0.1559],
        [ 0.4527,  0.3354,  0.4728,  0.0216, -0.3497, -0.0330],
        [-0.0508, -0.0484,  0.3716, -0.2226, -0.1615, -0.6703],
        [ 0.6507,  0.2174,  0.2447,  0.4357, -0.6709,  0.2212],
        [ 0.4629,  0.0531,  0.5840, -0.0717, -0.1690,  0.1795],
        [ 0.3765,  0.0114,  0.3430, -0.1083, -0.0689, -0.2839],
        [ 0.3227, -0.0015,  0.4579,  0.0266, -0.0169, -0.0295],
        [-0.0713, -0.0690,  0.7145, -0.2690, -0.2822,  0.1704],
        [ 0.0322, -0.0017,  0.5979, -0.3844, -0.0681,  0.1792],
        [ 0.0235, -0.0199,  0.5832, -0.3131, -0.3797,  0.3717],
        [ 0.1132, -0.0786,  0.4356, -0.1574,  0.1944,  0.0758],
        [ 0.0081, -0.3694,  0.4596, -0.0

In [51]:
# For a single example, squeeze(0) and view(-1, 6) give the same shape.
outputs['logits'].squeeze(0).shape, outputs['logits'].view(-1, 6).shape

(torch.Size([32, 6]), torch.Size([32, 6]))

In [66]:
# Fully flattened logits: one value per (token, label) pair.
outputs['logits'].view(-1).shape

torch.Size([192])

In [63]:
# Boolean mask over token positions: True where attention_mask is 1.
tokens['attention_mask'].view(-1) == 1

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False])

In [38]:
# Shape check for the labels: unsqueeze(0) adds a leading dimension and
# view(-1) flattens it back to one entry per token.
label_ids.shape, label_ids.unsqueeze(0).shape, label_ids.unsqueeze(0).view(-1).shape

(torch.Size([32]), torch.Size([1, 32]), torch.Size([32]))

In [None]:
def model_loop(model, loader, optimizer=None, train=False, print_steps=100):
    """Run one pass of ``model`` over ``loader`` in training or evaluation mode.

    Args:
        model: a ``torch.nn.Module`` called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output can be indexed with ``'loss'`` and ``'logits'``.
        loader: iterable of ``(tokens, labels)`` batches where ``tokens`` maps
            ``'input_ids'`` and ``'attention_mask'`` to tensors. Tensors shaped
            ``[batch, 1, seq]`` (as produced when dataset items keep a leading
            dimension of 1) are squeezed to ``[batch, seq]``.
        optimizer: optimizer stepped after each batch when ``train`` is true;
            with ``None`` no parameter update is performed.
        train: selects training mode (gradients enabled) or evaluation mode
            (gradients disabled).
        print_steps: print the running mean loss every this many batches;
            falsy disables printing.

    Returns:
        float: mean batch loss over the pass (0.0 for an empty loader).
    """
    # Set training or evaluation mode.
    model.train(mode=train)
    ctx_grad = nullcontext() if train else torch.no_grad()

    # Run on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    do_update = train and optimizer is not None
    total_loss = 0.0
    n_batches = 0

    with ctx_grad:
        for i, batch in enumerate(loader):
            b_tokens, b_labels = batch

            input_ids = b_tokens['input_ids'].to(run_device)
            attention_mask = b_tokens['attention_mask'].to(run_device)
            label_ids = b_labels.to(run_device)

            # Drop the per-item leading dimension kept by the dataset.
            if input_ids.dim() == 3:
                input_ids = input_ids.squeeze(1)
            if attention_mask.dim() == 3:
                attention_mask = attention_mask.squeeze(1)

            if do_update:
                optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_ids)
            loss = outputs['loss']

            if do_update:
                loss.backward()
                optimizer.step()

            # .item() detaches the value so the autograd graph is not retained.
            total_loss += loss.item()
            n_batches = i + 1

            if (print_steps) and ((i + 1) % print_steps == 0):
                print(f'Loss after {i + 1} steps: {total_loss / (i + 1)}')

    return total_loss / n_batches if n_batches else 0.0










In [50]:
def _sum_sample_accuracy(logits, labels):
    """Sum of per-sample token accuracies for one batch.

    Only positions whose label id is greater than 0 are scored. A sample with
    no such position contributes 0 instead of NaN.
    """
    total = 0.0
    for sample_logits, sample_labels in zip(logits, labels):
        keep = sample_labels > 0
        if not bool(keep.any()):
            continue
        predictions = sample_logits[keep].argmax(dim=1)
        total += (predictions == sample_labels[keep]).float().mean().item()
    return total


def train_loop(model, train_dataset, val_dataset, optimizer=None):
    """Fine-tune ``model`` for ``EPOCHS`` epochs, printing metrics per epoch.

    Batch size, epoch count and the default learning rate come from the
    module-level ``BATCH_SIZE``, ``EPOCHS`` and ``LEARNING_RATE``.

    Args:
        model: token-classification model called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            output can be indexed with ``'loss'`` and ``'logits'``.
        train_dataset: dataset of ``(tokens, label_ids)`` items used for
            training; token tensors are expected as ``[1, seq]`` per item.
        val_dataset: dataset with the same item layout, used for validation.
        optimizer: optimizer to use; when ``None`` an ``SGD`` optimizer with
            ``lr=LEARNING_RATE`` is created (the original behaviour).

    Returns:
        None. Per-epoch loss and accuracy are printed.
    """
    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    if optimizer is None:
        optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    for epoch_num in range(EPOCHS):

        total_acc_train = 0.0
        total_loss_train = 0.0

        model.train()

        for train_data, train_label in tqdm.tqdm(train_dataloader):

            # Collated items are [batch, 1, seq]; drop the middle dimension.
            input_id = train_data['input_ids'].squeeze(1).to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            train_label = train_label.to(device)

            optimizer.zero_grad()
            # Keyword arguments are required: the third positional parameter
            # of the Hugging Face forward() is token_type_ids, not labels.
            outputs = model(input_ids=input_id, attention_mask=mask, labels=train_label)
            loss, logits = outputs['loss'], outputs['logits']

            total_acc_train += _sum_sample_accuracy(logits, train_label)
            # The batch-mean loss is counted once per sample so that dividing
            # by the dataset size yields a per-sample average.
            total_loss_train += loss.item() * logits.shape[0]

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0.0
        total_loss_val = 0.0

        # No gradients are needed for validation; this avoids building the
        # autograd graph for every batch.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_data['attention_mask'].squeeze(1).to(device)
                input_id = val_data['input_ids'].squeeze(1).to(device)

                outputs = model(input_ids=input_id, attention_mask=mask, labels=val_label)
                loss, logits = outputs['loss'], outputs['logits']

                total_acc_val += _sum_sample_accuracy(logits, val_label)
                total_loss_val += loss.item() * logits.shape[0]

        print(
            f'Epochs: {epoch_num + 1} | '
            f'Loss: {total_loss_train / len(train_dataset): .3f} | '
            f'Accuracy: {total_acc_train / len(train_dataset): .3f} | '
            f'Val_Loss: {total_loss_val / len(val_dataset): .3f} | '
            f'Accuracy: {total_acc_val / len(val_dataset): .3f}')

In [103]:
# Adam optimizer over all model parameters, using the configured learning rate.
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

In [52]:
# Fine-tune on the training split and report metrics on the validation split.
# NOTE(review): the recorded output below (337 steps per epoch, 5 epochs) does
# not match the current settings (600 training items, BATCH_SIZE = 4,
# EPOCHS = 1) — presumably left over from an earlier run; re-run to refresh.
train_loop(model, dset_trn, dset_val)

100%|██████████| 337/337 [08:09<00:00,  1.45s/it]


Epochs: 1 | Loss:  0.273 | Accuracy:  0.917 | Val_Loss:  0.055 | Accuracy:  0.984


100%|██████████| 337/337 [08:08<00:00,  1.45s/it]


Epochs: 2 | Loss:  0.059 | Accuracy:  0.985 | Val_Loss:  0.034 | Accuracy:  0.990


100%|██████████| 337/337 [08:05<00:00,  1.44s/it]


Epochs: 3 | Loss:  0.035 | Accuracy:  0.990 | Val_Loss:  0.027 | Accuracy:  0.992


100%|██████████| 337/337 [08:07<00:00,  1.45s/it]


Epochs: 4 | Loss:  0.020 | Accuracy:  0.995 | Val_Loss:  0.024 | Accuracy:  0.992


100%|██████████| 337/337 [08:11<00:00,  1.46s/it]


Epochs: 5 | Loss:  0.013 | Accuracy:  0.997 | Val_Loss:  0.025 | Accuracy:  0.993


In [8]:
# Load the seqeval metric through the evaluate library.
import evaluate
seqeval = evaluate.load('seqeval')

In [13]:
# Toy check of the metric: two tag sequences per list, where prediction and
# reference differ only in the fourth tag of the first sequence.
t_predictions = [['B-LOG', 'I-LOG', 'I-LOG', 'I-LOG', 'B-MUN', 'I-MUN'], ['B-CEP', 'I-CEP', 'O']]
t_references = [['B-LOG', 'I-LOG', 'I-LOG', 'O', 'B-MUN', 'I-MUN'], ['B-CEP', 'I-CEP', 'O']]
seqeval.compute(predictions=t_predictions, references=t_references)

{'CEP': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'LOG': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'MUN': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 0.6666666666666666,
 'overall_recall': 0.6666666666666666,
 'overall_f1': 0.6666666666666666,
 'overall_accuracy': 0.8888888888888888}

In [None]:
# NOTE(review): elsewhere in this notebook `compute` is called on the loaded
# metric (`seqeval.compute(predictions=..., references=...)`), not on the
# `evaluate` module, and this call passes no arguments — presumably an
# unfinished cell; confirm the intended call before running.
evaluate.compute()