In [1]:
# boilerplate
import os
import sys
from pathlib import Path

path = Path(os.getcwd())
sys.path.append(str(path.parent))

In [2]:
import numpy as np
import tensorflow as tf

from keras.models import Input, Model
from sklearn.metrics import classification_report
from keras.preprocessing.sequence import pad_sequences

from bilstm import BiLSTM
from preprocessing import SentencesGenerate

In [3]:
PATH_DEV = '../../dataset/dev.txt'
PATH_TEST = '../../dataset/test.txt'
PATH_TRAIN = '../../dataset/train.txt'

MIN_LENGTH = 3
MAX_LEN_SENT = 128
MAX_LEN_CHAR = 20

### Carga de los datos

In [4]:
dev = SentencesGenerate(path=PATH_DEV, min_length=MIN_LENGTH)
test = SentencesGenerate(path=PATH_TEST, min_length=MIN_LENGTH)
train = SentencesGenerate(path=PATH_TRAIN, min_length=MIN_LENGTH)

### Preparación de los datos

In [5]:
def convert2idx(words, chars, tags):
    
    word2idx = {w: i for i, w in enumerate(words, 2)}
    word2idx['[PAD]'] = 0
    word2idx['[UNK]'] = 1

    char2idx = {c: i for i, c in enumerate(chars, 2)}
    char2idx['[PAD]'] = 0
    char2idx['[UNK]'] = 1

    # 'O' Tag by [PAD]
    tag2idx = {t: i for i, t in enumerate(tags)}

    return word2idx, char2idx, tag2idx

def get_word_input(sentences, max_len, word2idx):
    X = [[word2idx.get(word, 1) for word in sent] for sent in sentences]
    X = pad_sequences(sequences=X, maxlen=max_len, padding='post', value=word2idx['[PAD]'])
    return X

def get_y(sentences, max_len, tag2idx):
    y = [[tag2idx[tag] for tag in sent] for sent in sentences]
    y = pad_sequences(sequences=y, maxlen=max_len, padding='post', value=tag2idx['O'])
    y = [tf.keras.utils.to_categorical(sent, num_classes=len(tag2idx)) for sent in y]
    return np.array(y)

def get_char_input(sentences, max_len_sent, max_len_char, char2idx):
    X = []
    for sent in sentences:
        words_sequence = []
        for w in range(max_len_sent):
            chars_sequence = []
            for c in range(max_len_char):
                try:
                    chars_sequence.append(char2idx.get(sent[w][c], 1))
                except:
                    chars_sequence.append(char2idx.get('[PAD]'))
            words_sequence.append(chars_sequence)
        X.append(np.array(words_sequence))
    return np.array(X)

words_train = train.vocab
chars_train = list(set(''.join(words_train)))
labels = train.labels

word2idx, char2idx, tag2idx = convert2idx(words_train, chars_train, labels)

# trainining set
X_words_train = get_word_input(train.X, MAX_LEN_SENT, word2idx)
X_chars_train = get_char_input(train.X, MAX_LEN_SENT, MAX_LEN_CHAR, char2idx)
y_train = get_y(train.y, MAX_LEN_SENT, tag2idx)

In [6]:
tag2idx

{'I-ORG': 0,
 'O': 1,
 'B-LOC': 2,
 'I-LOC': 3,
 'I-MISC': 4,
 'B-MISC': 5,
 'B-ORG': 6,
 'B-PER': 7,
 'I-PER': 8}

In [7]:
# vocabulary comparison on the training and test set 
len(train.vocab & test.vocab) / len(train.vocab | test.vocab)

0.23287431234451103

In [8]:
print(X_words_train.shape, X_chars_train.shape, y_train.shape)

(7118, 128) (7118, 128, 20) (7118, 128, 9)


### Definición del modelo BiLSTM-CNN

In [19]:
model = BiLSTM(len(word2idx), len(char2idx), len(train.labels))

# inputs
word_input = Input(shape=(MAX_LEN_SENT,))
char_input = Input(shape=(MAX_LEN_SENT, MAX_LEN_CHAR))

output = model([char_input, word_input])
model = Model(inputs=[char_input, word_input], outputs=output)


In [21]:
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            [(None, 128, 20)]    0                                            
__________________________________________________________________________________________________
time_distributed_6 (TimeDistrib (None, 128, 20, 15)  1395        input_8[0][0]                    
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
time_distributed_7 (TimeDistrib (None, 128, 18, 15)  690         time_distributed_6[0][0]         
______________________________________________________________________________________________

### Definición de optimizador, función de costo y entrenamiento

In [22]:
# fit
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(x=[X_chars_train, X_words_train], y=y_train, batch_size=32, epochs=3, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f037406b280>

In [23]:
# test set
X_words_test = get_word_input(test.X, MAX_LEN_SENT, word2idx)
X_chars_test = get_char_input(test.X, MAX_LEN_SENT, MAX_LEN_CHAR, char2idx)
y_test = get_y(test.y, MAX_LEN_SENT, tag2idx)

### Predicción

In [24]:
y_predict = model.predict([X_chars_test, X_words_test])

In [25]:
idx2tag = {i: tag for tag, i in tag2idx.items()}
y_predict_bio = [list(map(lambda x: idx2tag[np.argmax(x)], sent)) for sent in y_predict]
y_test_bio = [list(map(lambda x: idx2tag[np.argmax(x)], sent)) for sent in y_test]

In [26]:
y_predict, y_actual = [], []
for a, p in zip(y_test_bio, y_predict_bio):
    y_actual.extend(a)
    y_predict.extend(p)
    assert len(y_predict) == len(y_actual)

### Métricas

In [27]:
print(classification_report(all_actual, all_predict))

              precision    recall  f1-score   support

       B-LOC       0.80      0.75      0.77      1077
      B-MISC       0.54      0.58      0.56       339
       B-ORG       0.85      0.80      0.83      1399
       B-PER       0.91      0.76      0.83       727
       I-LOC       0.61      0.70      0.65       325
      I-MISC       0.62      0.54      0.58       557
       I-ORG       0.89      0.75      0.81      1104
       I-PER       0.96      0.88      0.92       626
           O       1.00      1.00      1.00    166518

    accuracy                           0.99    172672
   macro avg       0.80      0.75      0.77    172672
weighted avg       0.99      0.99      0.99    172672

