In [3]:
# boilerplate
import os
import sys
from pathlib import Path

path = Path(os.getcwd())
sys.path.append(str(path.parent))

In [4]:
import numpy as np
import tensorflow as tf

from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

from preprocessing import SentencesGenerate

In [5]:
PATH_DEV = '../../dataset/dev.txt'
PATH_TEST = '../../dataset/test.txt'
PATH_TRAIN = '../../dataset/train.txt'

MIN_LENGTH = 3
MAX_LEN_SENT = 128

### Carga de los datos

In [6]:
train = SentencesGenerate(path=PATH_TRAIN, min_length=MIN_LENGTH)
test = SentencesGenerate(path=PATH_TEST, min_length=MIN_LENGTH)
dev = SentencesGenerate(path=PATH_DEV, min_length=MIN_LENGTH)

In [7]:
checkpoint = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [8]:
# vocabulary comparison on the training and test set
vocab = {word for word in tokenizer.vocab.keys()}
len(train.vocab & vocab) / len(train.vocab | vocab)

0.22350546389543605

### Preparación de los datos

In [10]:
encoded_inputs_train = dict(tokenizer(train.X,
                                      padding=True,
                                      truncation=True,
                                      max_length=MAX_LEN_SENT,
                                      is_split_into_words=True,
                                      return_offsets_mapping=True,
                                      return_tensors="tf"))

encoded_inputs_test = dict(tokenizer(test.X,
                                     padding=True,
                                     truncation=True,
                                     max_length=MAX_LEN_SENT,
                                     is_split_into_words=True,
                                     return_offsets_mapping=True,
                                     return_tensors="tf"))

In [11]:
tag2id = {tag: id for id, tag in enumerate(train.labels)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [12]:
tag2id

{'I-LOC': 0,
 'B-LOC': 1,
 'B-MISC': 2,
 'B-ORG': 3,
 'I-ORG': 4,
 'I-MISC': 5,
 'B-PER': 6,
 'O': 7,
 'I-PER': 8}

### Problema de alinamiento de *X* con respecto a *y*

Por como son representadas las oraciones por el modelo, al llamar a *tokenizer()* este a cada oración agrega *Tokens* especiales, por ejemplo al inicio de cada oración se le agrega el *Token* especial *\[CLS\]* y luego al final de la misma el *Token* especial *\[SEP\]* y finalmente el *Token* especial *\[PAD\]* de ser necesario.
Esto como consecuencia genera que los *Tokens* queden desalineados con respecto a sus etiquetas, por lo que hay que alinearlos. (La siguiente función es una posible solución al problema pero no la única)

In [13]:
# source: https://huggingface.co/transformers/custom_datasets.html

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in sent] for sent in tags]
    encoded_labels = []
    for idx, (doc_labels, doc_offset) in enumerate(zip(labels, encodings['offset_mapping'])):       
        # create an empty array of id tag O of max len sequences
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * tag2id['O']
        arr_offset = np.array(doc_offset)
        # set labels whose first offset position is 0 and the second is not 0
        n = len(doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)])
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels[:n]
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

train_labels = encode_tags(train.y, encoded_inputs_train)
test_labels = encode_tags(test.y, encoded_inputs_test)

In [14]:
print(test[0][0])
print(tokenizer.tokenize(tokenizer.decode(encoded_inputs_test['input_ids'][0][:15])))
print(test[0][1])
print(test_labels[0][:15])

['La', 'Coruña', ',', '23', 'may', '(', 'EFECOM', ')', '.']
['[CLS]', 'La', 'Coruña', ',', '23', 'may', '(', 'EF', '##EC', '##OM', ')', '.', '[SEP]', '[PAD]', '[PAD]']
['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']
[7, 1, 0, 7, 7, 7, 7, 3, 7, 7, 7, 7, 7, 7, 7]


### Carga del modelo BETO para clasificación de *Tokens*

In [15]:
model = TFAutoModelForTokenClassification.from_pretrained(checkpoint,
                                                          label2id=tag2id,
                                                          id2label=id2tag,
                                                          num_labels=len(train.labels))

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
encoded_inputs_train.pop("offset_mapping")
encoded_inputs_test.pop('offset_mapping')

<tf.Tensor: shape=(1349, 128, 2), dtype=int32, numpy=
array([[[ 0,  0],
        [ 0,  2],
        [ 0,  6],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  3],
        [ 0,  8],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  6],
        [ 0,  4],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       ...,

       [[ 0,  0],
        [ 0,  2],
        [ 0, 10],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  2],
        [ 0,  3],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]],

       [[ 0,  0],
        [ 0,  3],
        [ 0, 11],
        ...,
        [ 0,  0],
        [ 0,  0],
        [ 0,  0]]], dtype=int32)>

In [17]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(encoded_inputs_train),
                                                    train_labels))

### Definición de optimizador, función de costo y *Finetuning* del model

In [18]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss)
model.fit(train_dataset.shuffle(1000).batch(8), epochs=1, batch_size=16)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
  return py_builtins.overload_of(f)(*args)


<tensorflow.python.keras.callbacks.History at 0x7fbf2efcc820>

In [19]:
model.config.id2label

{0: 'I-LOC',
 1: 'B-LOC',
 2: 'B-MISC',
 3: 'B-ORG',
 4: 'I-ORG',
 5: 'I-MISC',
 6: 'B-PER',
 7: 'O',
 8: 'I-PER'}

### Predicción

In [20]:
y_predict = model.predict(dict(encoded_inputs_test))
softmax = tf.nn.softmax(y_predict.logits)



In [24]:
predict = [list(map(lambda x: id2tag[np.argmax(x)], sent)) for sent in softmax]
actual = [[id2tag[label] for label in sent] for sent in test_labels]

In [29]:
# Joint for metric
y_predict, y_actual = [], []
for a, p in zip(actual, predict):
    y_actual.extend(a)
    y_predict.extend(p)
    assert len(y_predict) == len(y_actual)

### Métricas

In [30]:
print(classification_report(y_actual, y_predict))

              precision    recall  f1-score   support

       B-LOC       0.88      0.81      0.85      1070
      B-MISC       0.77      0.82      0.79       339
       B-ORG       0.88      0.86      0.87      1395
       B-PER       0.94      0.97      0.95       719
       I-LOC       0.85      0.82      0.83       325
      I-MISC       0.88      0.87      0.88       557
       I-ORG       0.92      0.86      0.89      1104
       I-PER       0.94      0.98      0.96       616
           O       1.00      1.00      1.00    166547

    accuracy                           0.99    172672
   macro avg       0.89      0.89      0.89    172672
weighted avg       0.99      0.99      0.99    172672



In [None]:
# model.save_weights('beto')
# model = tf.keras.models.load_model('./model')


### Ejemplo predicción

In [27]:
for token, a, p in zip(tokenizer.tokenize(tokenizer.decode(encoded_inputs_test['input_ids'][0])),
                       actual[0],
                       predict[0]):
    print(f'{token} - {a} - {p}')

[CLS] - O - O
La - B-LOC - B-LOC
Coruña - I-LOC - I-LOC
, - O - O
23 - O - O
may - O - O
( - O - O
EF - B-ORG - B-ORG
##EC - O - O
##OM - O - O
) - O - O
. - O - O
[SEP] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O - O
[PAD] - O 