In [None]:
!pip install tensorflow
!pip install transformers
!pip install seqeval

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import metrics, optimizers, losses
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer, TFBertModel, BertConfig

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, classification_report, accuracy_score

In [None]:
# Name of the pre-trained checkpoint; the tokenizer, the BertConfig and the
# TFBertModel created further down are all loaded from this identifier.
BERT_MODEL_NAME = 'bert-base-uncased'

In [None]:
# Load the token-level NER table and forward-fill missing cells.
# NOTE(review): presumably 'Sentence #' is only populated on the first row of
# each sentence, which is why a forward fill is needed - confirm against the CSV.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
data = pd.read_csv('data/ner_dataset.csv', sep=",", encoding="latin1").ffill()

In [None]:
class ContextNER:
    """Turn a token-level NER table into sentences and lookup tables.

    The input DataFrame must provide the columns ``Sentence #``, ``Word`` and
    ``Tag``, one row per token.

    Attributes:
        all_words (set): distinct values of the ``Word`` column.
        all_tags (set): distinct values of the ``Tag`` column.
        num_words (int): number of distinct words.
        num_tags (int): number of distinct tags plus one for ``"[PAD]"``.
        sentences (list): one list of ``(word, tag)`` tuples per sentence.
        max_len (int): length of the longest sentence (0 when there are none).
        X (list): words of every sentence.
        y (list): tags of every sentence.
        word2idx / idx2word (dict): word <-> index lookups.
        tag2idx / idx2tag (dict): tag <-> index lookups, ``"[PAD]"`` is 0.
    """

    def __init__(self, df):

        self.__df = df

        self.all_words = set(df.Word.values)
        self.all_tags = set(df.Tag.values)

        self.num_words = len(self.all_words)
        # +1 reserves a slot for the "[PAD]" label added in __build_parsers.
        self.num_tags = len(self.all_tags) + 1

        self.sentences = self.__build_sentences()
        self.max_len = self.__get_maxlen()

        self.__build_Xy()
        self.__build_parsers()

    def __get_maxlen(self):
        """Return the number of tokens of the longest sentence (0 if none)."""
        return max((len(sentence) for sentence in self.sentences), default=0)

    def __build_sentences(self):
        """Group the rows by ``Sentence #`` into lists of ``(word, tag)`` tuples.

        Groups are visited in the sorted order of their ``Sentence #`` key.
        """
        return [
            list(zip(group.Word.values, group.Tag.values))
            for _, group in self.__df.groupby('Sentence #')
        ]

    def __build_Xy(self):
        """Split ``sentences`` into parallel word lists (X) and tag lists (y)."""
        self.X = [[word for word, __ in value] for value in self.sentences]
        self.y = [[tag for __, tag in value] for value in self.sentences]

    def __build_parsers(self):
        """Build the word/tag <-> index dictionaries.

        The vocabularies are enumerated in sorted order. Enumerating the sets
        directly would assign different indices on every kernel restart,
        because the iteration order of a set of strings is not stable across
        Python processes; that would make saved models and label ids
        irreproducible. ``key=str`` keeps the sort working even if a column
        holds non-string values.
        """
        # Converts a word into its index
        self.word2idx = {value: idx for idx,
                         value in enumerate(sorted(self.all_words, key=str))}

        # Converts an index into its word
        self.idx2word = {idx: value for value, idx in self.word2idx.items()}

        # Converts a tag into its index (index 0 is reserved for padding)
        self.tag2idx = {value: idx + 1 for idx,
                        value in enumerate(sorted(self.all_tags, key=str))}
        self.tag2idx["[PAD]"] = 0  # Padding

        # Converts an index into its tag
        self.idx2tag = {idx: value for value, idx in self.tag2idx.items()}

In [None]:
# Build the sentences, the word/tag lists and the lookup tables from the
# DataFrame loaded above.
contextNER = ContextNER(data)

In [None]:
# Per-sentence word lists and their matching tag lists.
Words = contextNER.X
Tags = contextNER.y

# Sequence length used for truncation, padding and the model inputs:
# the token count of the longest sentence in the data.
max_seq_length = contextNER.max_len
# Label id written on every position that carries no word-level tag
# (same value as contextNER.tag2idx["[PAD]"]).
pad_token_label_id = 0
# Number of positions reserved per sequence for the special tokens.
special_tokens_count = 2


In [None]:

# Load the tokenizer that belongs to BERT_MODEL_NAME.
# The casing behaviour is deliberately left to the checkpoint's own
# configuration: forcing do_lower_case=False on an *uncased* checkpoint keeps
# capital letters in the input, which then cannot be matched against the
# lower-cased vocabulary - capitalised words (i.e. most named entities) would
# be mapped to the unknown token.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

def convert_to_input(sentences, tags):
    """Convert word/tag sentences into BERT input features and label ids.

    Relies on the module-level ``tokenizer``, ``contextNER``,
    ``max_seq_length``, ``pad_token_label_id`` and ``special_tokens_count``.

    Args:
        sentences: iterable of sentences, each a list of words.
        tags: list of tag lists, parallel to ``sentences`` (its length is
            also used as the progress-bar total).

    Returns:
        A 4-tuple of (unpadded) lists with one entry per sentence, in this
        order: input ids, token type ids, attention masks, label ids.
        Only the first sub-token of a word carries the word's tag id; the
        remaining sub-tokens and the two special-token positions carry
        ``pad_token_label_id``.
    """
    input_id_list, attention_mask_list, token_type_id_list = [], [], []
    label_id_list = []

    # Room left for real tokens once the special tokens are accounted for.
    max_tokens = max_seq_length - special_tokens_count

    for words, word_tags in tqdm(zip(sentences, tags), total=len(tags)):

        tokens = []
        label_ids = []

        for word, label in zip(words, word_tags):

            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # The tokenizer can return nothing for a word. Without a
                # placeholder a label would still be appended while no token
                # is, shifting every following label of the sentence.
                word_tokens = [tokenizer.unk_token]

            tokens.extend(word_tokens)
            label_ids.extend([contextNER.tag2idx[label]]
                             + [pad_token_label_id] * (len(word_tokens) - 1))

        # Cut tokens and labels identically so they stay aligned.
        tokens = tokens[:max_tokens]
        label_ids = label_ids[:max_tokens]

        # One padding label for each of the two special-token positions.
        label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]
        inputs = tokenizer.encode_plus(tokens,
                                       add_special_tokens=True,
                                       truncation=True,
                                       max_length=max_seq_length)

        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        # Every position produced so far is a real (non-padding) position.
        attention_masks = [1] * len(input_ids)

        attention_mask_list.append(attention_masks)
        input_id_list.append(input_ids)
        token_type_id_list.append(token_type_ids)

        label_id_list.append(label_ids)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list


def pad_seq(seq, max_seq_length):
    """Bring every sequence in ``seq`` to exactly ``max_seq_length`` items.

    Shorter sequences are padded at the end and longer ones are cut at the
    end; the fill value is whatever ``pad_sequences`` uses by default.

    Args:
        seq: list of integer sequences.
        max_seq_length: target length of every sequence.

    Returns:
        The result of ``pad_sequences`` for ``seq`` with dtype ``"long"``.
    """
    padding_options = {
        "maxlen": max_seq_length,
        "dtype": "long",
        "truncating": "post",
        "padding": "post",
    }
    return pad_sequences(seq, **padding_options)

In [None]:
# Tokenize the whole corpus; the four results are parallel, still unpadded
# lists with one entry per sentence.
(input_ids_train,
 token_ids_train,
 attention_masks_train,
 label_ids_train) = convert_to_input(Words, Tags)

In [None]:
# Sanity check: print token id, token text and tag for every position of the
# first converted sentence.
first_sentence = zip(input_ids_train[0], label_ids_train[0])

for token_id, tag_id in first_sentence:
    word, tag = tokenizer.convert_ids_to_tokens(token_id), contextNER.idx2tag[tag_id]
    print(token_id, ' - ', word, ' - ', tag)

In [None]:
# Pad/cut all four feature lists to max_seq_length so that each one becomes a
# rectangular array.
input_ids_train, token_ids_train, attention_masks_train, label_ids_train = (
    pad_seq(sequences, max_seq_length)
    for sequences in (input_ids_train,
                      token_ids_train,
                      attention_masks_train,
                      label_ids_train)
)

# Model

In [None]:
# Symbolic model inputs. The order of `bert_inputs` is both the order BERT
# reads a list of inputs in (token ids, attention mask, token type ids) and
# the order the arrays have to be passed in when calling fit/predict.
input_ids = layers.Input(shape=(max_seq_length,),
                         dtype=tf.int32,
                         name="input_ids")

attention_masks = layers.Input(shape=(max_seq_length,),
                               dtype=tf.int32,
                               name="attention_masks")

token_type_ids = layers.Input(shape=(max_seq_length,),
                              dtype=tf.int32,
                              name="token_type_ids")

bert_inputs = [input_ids, attention_masks, token_type_ids]

# Pre-trained BERT encoder, frozen: only the layers stacked on top are trained.
bert_configs = BertConfig.from_pretrained(BERT_MODEL_NAME, num_labels=contextNER.num_tags)
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME, config=bert_configs)
bert_model.trainable = False

# First output of BERT: one feature vector per input position.
sequence_output = bert_model(bert_inputs)[0]

# Receives the embeddings/features of the preceding pre-trained layer (BERT).
# A plain LSTM with default arguments runs on the fast cuDNN kernel when a GPU
# is available and still works on CPU, unlike the GPU-only compat.v1 CuDNNLSTM.
bi_lstm = layers.Bidirectional(layers.LSTM(max_seq_length // 2,
                                           return_sequences=True),
                               name='bilstm')(sequence_output)

dropout = layers.TimeDistributed(layers.Dropout(0.3))(bi_lstm)

dense_layer = layers.TimeDistributed(layers.Dense(max_seq_length,
                                                  activation='relu',
                                                  name='last_dense'))(dropout)

# Per-position probability distribution over the tags (including "[PAD]").
output = layers.Dense(contextNER.num_tags,
                      activation="softmax",
                      name='predictions')(dense_layer)

model = models.Model(inputs=bert_inputs, outputs=output)

# The output layer already applies softmax, so the loss must be told that it
# receives probabilities (from_logits=False); otherwise softmax is applied a
# second time inside the loss.
model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
              loss=losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=[metrics.SparseCategoricalAccuracy('accuracy')])

In [None]:
# Print the summary of the assembled model.
model.summary()

In [None]:
# Model inputs, listed in the order the model's input layers were declared.
x_train = [input_ids_train, attention_masks_train, token_ids_train]

In [None]:
# Train the layers on top of the frozen BERT encoder; 30% of the data is
# held out for validation via `validation_split`.
fit_options = {
    "validation_split": 0.3,
    "batch_size": 16,
    "epochs": 3,
}
history = model.fit(x=x_train, y=label_ids_train, **fit_options)

In [None]:
# Display the values recorded by `model.fit` during training.
history.history