<a href="https://colab.research.google.com/github/pjpjean/rgeo/blob/main/Finetuning_NER_with_DistillBERT_(Address_Parsing).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação

In [1]:
import pandas as pd
import numpy as np
from contextlib import nullcontext
import tqdm
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim import SGD, Adam

In [2]:
%%capture
!pip install transformers datasets evaluate seqeval

In [24]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
import evaluate

# Carga dos dados

In [4]:
dados = load_dataset('csv', data_files='/content/drive/MyDrive/Colab Notebooks/data/br_100mil.csv',
                     column_names=['address', 'tags'],
                     split='train')



In [5]:
dados[:4], dados[-5:]

({'address': ['r augusto bailao itaberaí go 76630-000',
   'rua capitao daindo 435 redentora rs 98550-000',
   'rua luiz montelatto 85 louveira sp 13290-000',
   'r apiacas 53 belford roxo rj 26115-280'],
  'tags': ['LOG LOG LOG MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG NUM MUN MUN UF CEP']},
 {'address': ['rua ernesto duarte de almeida 21 são fidélis rj 28400-000',
   'rua dinamarca altos pi 64290-000',
   'rua sao martinho 1265 pradópolis sp 14850-000',
   'avenida coronel joao da mota ribeiro 331 bom jesus do amparo mg cep 35908-000',
   'estrada mgg 010 mogi guaçu sp cep 13840-000'],
  'tags': ['LOG LOG LOG LOG LOG NUM MUN MUN UF CEP',
   'LOG LOG MUN UF CEP',
   'LOG LOG LOG NUM MUN UF CEP',
   'LOG LOG LOG LOG LOG LOG NUM MUN MUN MUN MUN UF CEP CEP',
   'LOG LOG LOG MUN MUN UF CEP CEP']})

In [6]:
caracteres = {uc for t in dados['address'] for uc in t}
''.join(sorted(caracteres))

' "#\'()-.0123456789:=[]abcdefghijklmnopqrstuvwxyz|ªºáâãçéêíóôõú�'

In [7]:
outside_label = 'O'
labelset = {ul for l in dados['tags'] for ul in l.split()}
labelset = ([outside_label] + [l for l in labelset if l != outside_label])
label2id = {l: i for i, l in enumerate(labelset)}
id2label = {i: l for i, l in enumerate(labelset)}
labelset

['O', 'NUM', 'CEP', 'MUN', 'LOG', 'UF']

In [8]:
max_len = max([len(t) for t in dados['address']])
max_len

108

In [9]:
max_words = max([len(t.split()) for t in dados['address']])
max_words

19

# Preparação dos datasets

In [11]:
MAX_LEN = 32
IGNORED_LABEL_ID = -100
MODEL_CHECKPOINT = 'distilbert-base-uncased'
TAM_AMOSTRA = 1000

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
dset = dados.train_test_split(train_size=int(0.6 * TAM_AMOSTRA),
                              test_size=int(0.4 * TAM_AMOSTRA),
                              seed=1452320)
dset['test'], dset['validation'] = (dset['test']
                                    .train_test_split(test_size=0.5, seed=1452321)
                                    .values())



In [14]:
dset

DatasetDict({
    train: Dataset({
        features: ['address', 'tags'],
        num_rows: 600
    })
    test: Dataset({
        features: ['address', 'tags'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['address', 'tags'],
        num_rows: 200
    })
})

In [15]:
# Função auxiliar que, a partir do word_id, retorna o índice do label 
# respectivo na lista original de labels
def get_token_label_pos(word2token, word_id):
    if word_id is None:
        return None

    try:
        l_pos = word2token.index(word_id)
    except ValueError:
        l_pos = max([idx for idx, wid in enumerate(word2token) if wid <= word_id])

    return l_pos

# Rotina de tokenização dos dados
def tokenize_and_align_labels(data, label2id, outside_label):
    tokens = tokenizer(data['address'], truncation=True)

    new_labels = []
    outside_id = label2id.get(outside_label)
    for i, (text, tags) in enumerate(zip(data['address'], data['tags'])):
        labels = tags.split()

        # Posição das palavras conforme separação original do deepparse
        dp_word_pos = [0] + list(np.cumsum([len(t) + 1 for t in text.split()[:-1]]))

        # Identificação de qual token inicia cada palavra original
        dp_word_to_token = [tokens.char_to_word(i, wp) for wp in dp_word_pos]

        # Monta a lista de (ids de) rótulos realinhada com a tokenização do Bert
        token_ids = tokens.word_ids(i)
        label_pos = [get_token_label_pos(dp_word_to_token, wid) for wid in token_ids]
        new_labels.append([label2id.get(labels[pos], outside_id)
            if pos is not None else IGNORED_LABEL_ID
            for pos in label_pos])

    tokens["labels"] = new_labels
    return tokens

In [16]:
t_dset = dset.map(
    lambda x: tokenize_and_align_labels(x, label2id, outside_label),
    batched=True,
    remove_columns=dset["train"].column_names,    
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
dset['train'][120]

{'address': 'rua prof zuza 735 a natal rn 59025-160',
 'tags': 'LOG LOG LOG NUM NUM MUN UF CEP'}

In [18]:
t_dset['train'][120]

{'input_ids': [101,
  21766,
  2050,
  11268,
  16950,
  4143,
  6421,
  2629,
  1037,
  17489,
  29300,
  25186,
  17788,
  1011,
  8148,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 4, 4, 4, 4, 4, 1, 1, 1, 3, 5, 2, 2, 2, 2, -100]}

In [19]:
# Ajusta a sequência de labels para o padrão IOB
def iob_labels(labels, outside_label='O'):
    add_prefix = lambda l, a: l if l == outside_label else ('B-' if l != a else 'I-') + l
    iob = [add_prefix(l, lant) for l, lant in zip(labels, ['###'] + labels[:-1])]
    return iob

In [20]:
# Representação textual dos dados tokenizados
def get_text_repr(data, iob=False):

    is_nested = any(isinstance(item, list) for item in data['labels'])
    d_input_ids = data['input_ids'] if is_nested else [data['input_ids']]
    d_labels = data['labels'] if is_nested else [data['labels']]

    text = []
    tags = []
    for input_ids, labels in zip(d_input_ids, d_labels):
        valid_ids = [id for id in input_ids if id != 0]
        text.append(' '.join(tokenizer.convert_ids_to_tokens(valid_ids)))
        if iob:
            tags.append(' '.join(iob_labels(
                [id2label.get(i, outside_label) for i in labels[:len(valid_ids)]]
            )))
        else:
            tags.append(' '.join(
                id2label.get(i, outside_label) for i in labels[:len(valid_ids)]
            ))

    return text, tags

In [21]:
get_text_repr(t_dset['train'][120])

(['[CLS] ru ##a prof zu ##za 73 ##5 a natal rn 590 ##25 - 160 [SEP]'],
 ['O LOG LOG LOG LOG LOG NUM NUM NUM MUN UF CEP CEP CEP CEP O'])

In [22]:
get_text_repr(t_dset['train'][120], iob=True)

(['[CLS] ru ##a prof zu ##za 73 ##5 a natal rn 590 ##25 - 160 [SEP]'],
 ['O B-LOG I-LOG I-LOG I-LOG I-LOG B-NUM I-NUM I-NUM B-MUN B-UF B-CEP I-CEP I-CEP I-CEP O'])

# Especificação dos lotes e das metricas

In [25]:
collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
metric = evaluate.load('seqeval')

In [26]:
# The function that will be used to compute metrics at evaluation.
# Must take a EvalPrediction and return a dictionary string to metric values.
def compute_metrics(eval_pred):
    # Remove ignored index (special tokens) and convert to labels
    true_label_ids = eval_pred.label_ids
    true_labels = [
        iob_labels(
            [id2label.get(l_id, outside_label)
                for l_id in label_ids
                if l_id != IGNORED_LABEL_ID]
        ) for label_ids in true_label_ids
    ]

    # Align prediction to true_labels and convert to labels
    pred_label_ids = np.argmax(eval_pred.predictions, axis=-1)
    pred_labels = [
        iob_labels(
            [id2label.get(p_id, outside_label)
                for (p_id, l_id) in zip(pred_ids, label_ids)
                if l_id != IGNORED_LABEL_ID]
        ) for pred_ids, label_ids in zip(pred_label_ids, true_label_ids)
    ]
    all_metrics = metric.compute(predictions=pred_labels, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

# Inicialização do modelo BERT

In [27]:
%%capture
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT,
    id2label=id2label,
    label2id=label2id
)

model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [28]:
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "NUM",
    "2": "CEP",
    "3": "MUN",
    "4": "LOG",
    "5": "UF"
  },
  "initializer_range": 0.02,
  "label2id": {
    "CEP": 2,
    "LOG": 4,
    "MUN": 3,
    "NUM": 1,
    "O": 0,
    "UF": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

Como a função perda do modelo é a *entropia cruzada*, o modelo ainda não treinado deve ter uma perda próxima da entropia máxima, que é de $-\ln(1/n)$, onde $n$ é o número de rótulos. No caso, $-ln(1/6)=1.79$.

In [29]:
tokens = collator([t_dset['train'][i] for i in range(10)])
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']
label_ids = tokens['labels']
outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label_ids)
outputs['loss']

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor(1.9145, grad_fn=<NllLossBackward0>)

In [30]:
args = TrainingArguments(
    "distilbert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)



In [31]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=t_dset["train"],
    eval_dataset=t_dset["validation"],
    data_collator=collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [32]:
trainer.train()

***** Running training *****
  Num examples = 600
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 225
  Number of trainable parameters = 66367494


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.108598,0.885057,0.896296,0.890641,0.961102
2,No log,0.068496,0.920084,0.925926,0.922996,0.973027
3,No log,0.061523,0.919875,0.93545,0.927597,0.975014


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-75
Configuration saved in distilbert-finetuned-ner/checkpoint-75/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-75/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-75/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-75/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to distilbert-finetuned-ner/checkpoint-150
Configuration saved in distilbert-finetuned-ner/checkpoint-150/config.json
Model weights saved in distilbert-finetuned-ner/checkpoint-150/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-ner/checkpoint-150/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-ner/checkpoint-150/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 2

TrainOutput(global_step=225, training_loss=0.23616021050347222, metrics={'train_runtime': 336.9177, 'train_samples_per_second': 5.343, 'train_steps_per_second': 0.668, 'total_flos': 11065484661120.0, 'train_loss': 0.23616021050347222, 'epoch': 3.0})

In [34]:
token_classifier = pipeline(
    "token-classification", model='distilbert-finetuned-ner/checkpoint-225', aggregation_strategy="simple"
)

loading configuration file distilbert-finetuned-ner/checkpoint-225/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-finetuned-ner/checkpoint-225",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "O",
    "1": "NUM",
    "2": "CEP",
    "3": "MUN",
    "4": "LOG",
    "5": "UF"
  },
  "initializer_range": 0.02,
  "label2id": {
    "CEP": 2,
    "LOG": 4,
    "MUN": 3,
    "NUM": 1,
    "O": 0,
    "UF": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading configuration file distilbert-finetuned-ner/checkpoint-225/config.json
Model config Disti

In [35]:
token_classifier('rua mons. bruno 1005 ap 500 fortaleza ce 60050-020')

[{'entity_group': 'LOG',
  'score': 0.9968754,
  'word': 'rua mons. bruno',
  'start': 0,
  'end': 15},
 {'entity_group': 'NUM',
  'score': 0.9166573,
  'word': '1005 ap 500',
  'start': 16,
  'end': 27},
 {'entity_group': 'MUN',
  'score': 0.9890807,
  'word': 'fortaleza',
  'start': 28,
  'end': 37},
 {'entity_group': 'UF',
  'score': 0.99000424,
  'word': 'ce',
  'start': 38,
  'end': 40},
 {'entity_group': 'CEP',
  'score': 0.99780333,
  'word': '60050 - 020',
  'start': 41,
  'end': 50}]

In [36]:
token_classifier('sqhn qcn 11 bl 12 cj 20 brasilia df 31000-000')

[{'entity_group': 'LOG',
  'score': 0.98681176,
  'word': 'sqhn qcn',
  'start': 0,
  'end': 8},
 {'entity_group': 'NUM',
  'score': 0.5627875,
  'word': '11 b',
  'start': 9,
  'end': 13},
 {'entity_group': 'LOG',
  'score': 0.69578105,
  'word': '##l',
  'start': 13,
  'end': 14},
 {'entity_group': 'NUM',
  'score': 0.7693461,
  'word': '12 c',
  'start': 15,
  'end': 19},
 {'entity_group': 'LOG',
  'score': 0.5169009,
  'word': '##j',
  'start': 19,
  'end': 20},
 {'entity_group': 'NUM',
  'score': 0.94562596,
  'word': '20',
  'start': 21,
  'end': 23},
 {'entity_group': 'MUN',
  'score': 0.97980535,
  'word': 'brasilia',
  'start': 24,
  'end': 32},
 {'entity_group': 'UF',
  'score': 0.8693732,
  'word': 'df',
  'start': 33,
  'end': 35},
 {'entity_group': 'CEP',
  'score': 0.9980655,
  'word': '31000 - 000',
  'start': 36,
  'end': 45}]

In [37]:
token_classifier('rua vf48 casa 20 goiania go 02000-000')

[{'entity_group': 'LOG',
  'score': 0.9223651,
  'word': 'rua vf48 casa',
  'start': 0,
  'end': 13},
 {'entity_group': 'NUM',
  'score': 0.94974506,
  'word': '20',
  'start': 14,
  'end': 16},
 {'entity_group': 'MUN',
  'score': 0.9872832,
  'word': 'goiania',
  'start': 17,
  'end': 24},
 {'entity_group': 'UF',
  'score': 0.9931039,
  'word': 'go',
  'start': 25,
  'end': 27},
 {'entity_group': 'CEP',
  'score': 0.99790686,
  'word': '02000 - 000',
  'start': 28,
  'end': 37}]

In [38]:
token_classifier('rua vf-48 casa 20 goiania go 02000-000')

[{'entity_group': 'LOG',
  'score': 0.99325407,
  'word': 'rua vf -',
  'start': 0,
  'end': 7},
 {'entity_group': 'NUM',
  'score': 0.6467752,
  'word': '48 casa 20',
  'start': 7,
  'end': 17},
 {'entity_group': 'MUN',
  'score': 0.986787,
  'word': 'goiania',
  'start': 18,
  'end': 25},
 {'entity_group': 'UF',
  'score': 0.9929766,
  'word': 'go',
  'start': 26,
  'end': 28},
 {'entity_group': 'CEP',
  'score': 0.9978073,
  'word': '02000 - 000',
  'start': 29,
  'end': 38}]

In [39]:
!zip -r distilbert-finetune-ner-20230127.zip distilbert-finetuned-ner/checkpoint-225

  adding: distilbert-finetuned-ner/checkpoint-225/ (stored 0%)
  adding: distilbert-finetuned-ner/checkpoint-225/tokenizer_config.json (deflated 41%)
  adding: distilbert-finetuned-ner/checkpoint-225/vocab.txt (deflated 53%)
  adding: distilbert-finetuned-ner/checkpoint-225/training_args.bin (deflated 49%)
  adding: distilbert-finetuned-ner/checkpoint-225/special_tokens_map.json (deflated 42%)
  adding: distilbert-finetuned-ner/checkpoint-225/tokenizer.json (deflated 71%)
  adding: distilbert-finetuned-ner/checkpoint-225/scheduler.pt (deflated 50%)
  adding: distilbert-finetuned-ner/checkpoint-225/rng_state.pth (deflated 23%)
  adding: distilbert-finetuned-ner/checkpoint-225/optimizer.pt (deflated 39%)
  adding: distilbert-finetuned-ner/checkpoint-225/pytorch_model.bin (deflated 8%)
  adding: distilbert-finetuned-ner/checkpoint-225/config.json (deflated 47%)
  adding: distilbert-finetuned-ner/checkpoint-225/trainer_state.json (deflated 65%)


In [40]:
from google.colab import files
files.download('distilbert-finetune-ner-20230127.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>