In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from seqeval.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

from tensorflow.keras import layers, metrics, optimizers, losses, Model

from crf_layer import CRF
from preprocessing.contextNER import ContextNER
from time import sleep
from tqdm import tqdm

In [5]:
data = pd.read_csv('../data/custom/DATA_TWEETS_VACINAS_PT_BR.csv')# sep=",", encoding="latin1").fillna(method='ffill')

In [6]:
data

Unnamed: 0,Sentence,Word,Tag
0,Sentence #0,A,O
1,Sentence #0,terceira,O
2,Sentence #0,dose,O
3,Sentence #0,da,O
4,Sentence #0,vacina,O
...,...,...,...
886079,Sentence #69467,vacina,O
886080,Sentence #69467,.O,O
886081,Sentence #69467,Governo,O
886082,Sentence #69467,…,O


In [7]:
data_ner = ContextNER(data, groupby='Sentence')

In [4]:
class ContextNER:

    def __init__(self, df):

        self.__df = df

        self.all_words = set(df.Word.values)
        self.all_tags = set(df.Tag.values)

        self.num_words = len(self.all_words)
        self.num_tags = len(self.all_tags)

        self.sentences = self.__build_sentences()
        self.max_len = self.__get_maxlen()

        self.__build_Xy()
        self.__build_parsers()

    def __get_maxlen(self):
        return max([len(x) for x in self.sentences]) 

    def __build_sentences(self):

        return [x for x in self.__df.groupby('Sentence #').apply(
            lambda xdef: [x for x in zip(
                xdef.Word.values,
                xdef.Tag.values
            )]
        )]

    def __build_Xy(self):

        self.X = [[word for word, __ in value] for value in self.sentences]
        self.y = [[tag for __, tag in value] for value in self.sentences]

    def __build_parsers(self):
        
        self.tag2idx = {value: idx for idx, value in enumerate(self.all_tags)}
        self.idx2tag = {idx: value for value, idx in self.tag2idx.items()}

In [8]:
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST  = train_test_split(data_ner.X_array,
                                                     data_ner.y_array,
                                                     random_state=42,
                                                     test_size=0.3)

In [9]:
X_TRAIN.shape, X_TEST.shape

((31325, 66), (13425, 66))

In [10]:
def all_metrics(pred_tag, true_tag):

    print(classification_report(pred_tag, true_tag))
    print('=' * 25)
    print("Precision: \t", precision_score(pred_tag, true_tag))
    print("Recall: \t", recall_score(pred_tag, true_tag))
    print("F1: \t\t", f1_score(pred_tag, true_tag))
    
def build_matrix_embeddings(path, num_tokens, embedding_dim, word_index):
    """
        Função para carregar arquivos pre-treinados em memória
    """

    hits, misses = 0, 0
    embeddings_index = {}

    print('Loading file...')

    sleep(0.5)

    for line in tqdm(open(path, encoding='utf-8')):
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    print("Encontrado %s Word Vectors." % len(embeddings_index))

    sleep(0.5)

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))

    for word, i in tqdm(word_index.items()):
        if i >= num_tokens:
            continue
        try:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                embedding_vector = embeddings_index.get(str(word).lower())
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    hits += 1
                else:
                    embedding_vector = embeddings_index.get(str(word).upper())
                    if embedding_vector is not None:
                        embedding_matrix[i] = embedding_vector
                        hits += 1
                misses += 1
        except:
            embedding_matrix[i] = embeddings_index.get('UNK')

    print("Convertidos: %d Tokens | Perdidos: %d Tokens" % (hits, misses))

    return embedding_matrix

# *Custom Model*

In [11]:
class NER_MODEL(Model):
    
    def __init__(self, 
                 configs_ner_params=None, 
                 dropout_rate=0.3,
                 embeddings=None,
                 use_crf=False,
                 hiden_units=256):
        
        super(NER_MODEL, self).__init__()

        self.configs_ner_params = configs_ner_params
        self.hiden_units = hiden_units
        self.use_crf = use_crf

        self.embedding = layers.Embedding(input_length=configs_ner_params.max_len, 
                                          input_dim=embeddings.shape[0],
                                          output_dim=embeddings.shape[1],
                                          weights=[embeddings],
                                          trainable=True)
     
         #         self.embedding = layers.Embedding(input_dim=configs_ner_params.num_words,
         #                                           output_dim=configs_ner_params.max_len,
         #                                           input_length=configs_ner_params.max_len)

        self.dropout = layers.TimeDistributed(layers.Dropout(dropout_rate))
            
        self.bilstm = layers.Bidirectional(layers.LSTM(units=hiden_units // 2,
                                                       return_sequences=True,
                                                       recurrent_dropout=0.1))
        
        self.dense_crf = layers.TimeDistributed(layers.Dense(units=configs_ner_params.num_tags, 
                                                             activation='relu'))

        self.classifier_crf = CRF(configs_ner_params.num_tags, sparse_target=True)

        self.dense = layers.TimeDistributed(layers.Dense(units=self.hiden_units, 
                                                         activation='relu'))

        self.classifier_softmax = layers.TimeDistributed(layers.Dense(units=configs_ner_params.num_tags, 
                                                                      activation='softmax'))

    def call(self, inputs):

        X = self.embedding(inputs)
        X = self.dropout(X)
        X = self.bilstm(X)
        
        if self.use_crf:
            
            X = self.dense_crf(X)
            X = self.classifier_crf(X)
        else:
            
            X = self.dense(X)
            X = self.classifier_softmax(X)
            
        return X
        
    def model(self):
        
        """
            Implementação baseada na API Funcional, permite utilizar o método 'model.summary' 
            antes de executar o método 'model.fit'
            
            Retorna o modelo compilado com base no classificador CRF / SOFTMAX
            
            parâmetro: use_crf
        """
        
        X = layers.Input(shape=(self.configs_ner_params.max_len,))

        X = Model(inputs=[X], outputs=self.call(X))
        
        optm = optimizers.Adam(learning_rate=0.001)

        if self.use_crf:
            X.compile(optimizer=optm,
                      loss=[self.classifier_crf.loss],
                      metrics=[self.classifier_crf.accuracy])
        else:
            X.compile(optimizer=optm,
                      loss=losses.CategoricalCrossentropy(),
                      metrics=metrics.CategoricalAccuracy('accuracy'))
        return X

In [12]:
%%time
file_path = './crawl-300d-2M.vec'

glove_embeddings = \
build_matrix_embeddings(path=file_path,
                        num_tokens=data_ner.num_words, 
                        embedding_dim=300, 
                        word_index=data_ner.word2idx)

Loading file...


1999996it [02:05, 15988.51it/s]


Encontrado 1999996 Word Vectors.


100%|██████████| 93220/93220 [00:00<00:00, 298128.18it/s]

Convertidos: 23816 Tokens | Perdidos: 72067 Tokens
CPU times: user 2min, sys: 5.64 s, total: 2min 6s
Wall time: 2min 7s





In [13]:
model = NER_MODEL(data_ner, 
                  embeddings=glove_embeddings,
                  use_crf=True).model()

In [14]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 66)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 66, 300)           27966000  
_________________________________________________________________
time_distributed (TimeDistri (None, 66, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 66, 256)           439296    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 66, 10)            2570      
_________________________________________________________________
crf (CRF)                    (None, 66, 10)            100       
Total params: 28,407,966
Trainable params: 28,407,966
Non-trainable params: 0
_________________________________________________

In [15]:
%%time

History = model.fit(X_TRAIN,
                    Y_TRAIN,
                    validation_split=0.1,
                    batch_size=64, 
                    epochs=15)

Epoch 1/15
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
CPU times: user 7h 48min 4s, sys: 1h 26min 34s, total: 9h 14min 39s
Wall time: 1h 14min 41s


In [16]:
preds = model.predict(X_TEST, verbose=1, batch_size=64)
                      
y_pred, y_true = \
np.argmax(preds, axis=-1), \
np.argmax(Y_TEST, -1)

pred_tag, true_tag = \
data_ner.parser2categorical(y_pred, y_true)



In [17]:
all_metrics(pred_tag, true_tag)



              precision    recall  f1-score   support

          AD       1.00      1.00      1.00     13425
         LOC       0.58      0.67      0.62      6521
        MISC       0.36      0.41      0.39      4523
         ORG       0.46      0.46      0.46      3294
         PER       0.61      0.64      0.63      5686

   micro avg       0.70      0.74      0.72     33449
   macro avg       0.60      0.64      0.62     33449
weighted avg       0.71      0.74      0.73     33449

Precision: 	 0.7007953968522592
Recall: 	 0.7428024754103262
F1: 		 0.7211877567014499
