In [1]:
# !pip install tensorflow
# !pip install transformers
# !pip install seqeval



In [2]:
import os

In [3]:
os.environ['KAGGLE_USERNAME'] = "xwalker" # Usuário Kaggle
os.environ['KAGGLE_KEY'] = "799d9818e9349a3dd767276d469df34a" # Token de Acesso

!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus

!unzip entity-annotated-corpus.zip

entity-annotated-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  entity-annotated-corpus.zip
replace ner.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ner.csv                 
replace ner_dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: ner_dataset.csv         


In [26]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import metrics, optimizers, losses
from tensorflow.keras.preprocessing.sequence import pad_sequences

from transformers import BertTokenizer, TFBertModel, BertConfig

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, classification_report, accuracy_score

In [5]:
BERT_MODEL_NAME = 'bert-base-uncased'

In [6]:
data = pd.read_csv('ner_dataset.csv', sep=",", encoding="latin1").fillna(method='ffill')

In [31]:
class ContextNER:

    def __init__(self, df):

        self.__df = df

        self.all_words = set(df.Word.values)
        self.all_tags = set(df.Tag.values)

        self.num_words = len(self.all_words)
        self.num_tags = len(self.all_tags) + 1

        self.sentences = self.__build_sentences()
        self.max_len = self.__get_maxlen()

        self.__build_Xy()
        self.__build_parsers()

    def __get_maxlen(self):
        return max([len(x) for x in self.sentences]) 

    def __build_sentences(self):

        return [x for x in self.__df.groupby('Sentence #').apply(
            lambda xdef: [x for x in zip(
                xdef.Word.values,
                xdef.Tag.values
            )]
        )]

    def __build_Xy(self):

        self.X = [[word for word, __ in value] for value in self.sentences]
        self.y = [[tag for __, tag in value] for value in self.sentences]

    def __build_parsers(self):

        self.word2idx = {value: idx for idx,
                         value in enumerate(self.all_words)}

        # Converte um index em Word
        self.idx2word = {idx: value for value, idx in self.word2idx.items()}

        # Converte Tag em ìndice
        self.tag2idx = {value: idx + 1 for idx,
                        value in enumerate(self.all_tags)}
        self.tag2idx["[PAD]"] = 0  # Padding - Preenchimento

        # Converte index em Tag
        self.idx2tag = {idx: value for value, idx in self.tag2idx.items()}

In [32]:
contextNER = ContextNER(data)

In [9]:
Words, Tags = contextNER.X, contextNER.y

In [39]:
def __bert_encode(texts, tokenizer, max_len=None):
    
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len - 2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [40]:
max_seq_length = contextNER.max_len

pad_token_label_id = 0
special_tokens_count =  2


tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, 
                                        do_lower_case=False)

def convert_to_input(sentences, tags):

    input_id_list, attention_mask_list, token_type_id_list = [], [], []
    label_id_list = []
  
    for x, y in tqdm(zip(sentences, tags), total = len(tags)):
  
        tokens = []
        label_ids = []

        for word, label in zip(x, y):
            
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            label_ids.extend([contextNER.tag2idx[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
            
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        label_ids = [pad_token_label_id] + label_ids + [pad_token_label_id]
        inputs = tokenizer.encode_plus(tokens,
                                       add_special_tokens=True, 
                                       truncation=True,
                                       max_length=max_seq_length)

        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_masks = [1] * len(input_ids)

        attention_mask_list.append(attention_masks)
        input_id_list.append(input_ids)
        token_type_id_list.append(token_type_ids)

        label_id_list.append(label_ids)

    return input_id_list, token_type_id_list, attention_mask_list, label_id_list


def pad_seq(seq, max_seq_length):
    return pad_sequences(seq,
                         maxlen=max_seq_length,
                         dtype="long",
                         truncating="post",
                         padding="post")

In [41]:
input_ids_train, token_ids_train, attention_masks_train, label_ids_train = convert_to_input(Words, Tags)

100%|██████████| 47959/47959 [01:09<00:00, 685.49it/s]


In [42]:
for token_id, tag_id in zip(input_ids_train[0], label_ids_train[0]):
    
    word = tokenizer.convert_ids_to_tokens(token_id)
    tag = contextNER.idx2tag[tag_id]
    
    print(token_id, ' - ', word, ' - ', tag)

101  -  [CLS]  -  [PAD]
100  -  [UNK]  -  O
1997  -  of  -  O
28337  -  demonstrators  -  O
2031  -  have  -  O
9847  -  marched  -  O
2083  -  through  -  O
100  -  [UNK]  -  B-geo
2000  -  to  -  O
6186  -  protest  -  O
1996  -  the  -  O
2162  -  war  -  O
1999  -  in  -  O
100  -  [UNK]  -  B-geo
1998  -  and  -  O
5157  -  demand  -  O
1996  -  the  -  O
10534  -  withdrawal  -  O
1997  -  of  -  O
100  -  [UNK]  -  B-gpe
3629  -  troops  -  O
2013  -  from  -  O
2008  -  that  -  O
2406  -  country  -  O
1012  -  .  -  O
102  -  [SEP]  -  [PAD]


In [43]:
input_ids_train = pad_seq(input_ids_train, max_seq_length)
token_ids_train = pad_seq(token_ids_train, max_seq_length)
attention_masks_train = pad_seq(attention_masks_train, max_seq_length)
label_ids_train = pad_seq(label_ids_train, max_seq_length)

# Model

In [44]:
input_ids = layers.Input(shape=(max_seq_length,), 
                         dtype=tf.int32, 
                         name="input_ids")

token_type_ids = layers.Input(shape=(max_seq_length,), 
                              dtype=tf.int32, 
                              name="attention_masks")

attention_masks = layers.Input(shape=(max_seq_length,), 
                               dtype=tf.int32,
                               name="token_type_ids")

bert_inputs = [input_ids, token_type_ids, attention_masks]

bert_configs = BertConfig.from_pretrained(BERT_MODEL_NAME, num_labels=contextNER.num_tags)
bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME, config=bert_configs)
bert_model.trainable = False

sequence_output = bert_model(bert_inputs)[0]

# Recebe os embedings/features da camada pre-treinada anterior (BERT)

# bi_lstm = layers.Bidirectional(layers.LSTM(max_seq_length // 2, 
#                                            return_sequences=True,
#                                            recurrent_dropout=0.1), name='bilstm')(sequence_output)

# Usar com GPU para acelerar treinamento
bi_lstm = layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(max_seq_length // 2, 
                                                             return_sequences=True),
                                                             name='bilstm')(sequence_output)

dropout = layers.TimeDistributed(layers.Dropout(0.3))(bi_lstm)

dense_layer = layers.TimeDistributed(layers.Dense(max_seq_length,
                                                  activation='relu',
                                                  name='last_dense'))(dropout)

output = layers.Dense(contextNER.num_tags,
                               activation="softmax",
                               name='predictions')(dense_layer)

model = models.Model(inputs=bert_inputs, outputs=output)

model.compile(optimizer=optimizers.Adam(lr=0.0001),
              loss=losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[metrics.SparseCategoricalAccuracy('accuracy')])

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




In [45]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 104)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 104)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 104)]        0                                            
__________________________________________________________________________________________________
tf_bert_model_3 (TFBertModel)   TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 token_type_ids[0][0]       

In [46]:
x_train = [input_ids_train,
           attention_masks_train,
           token_ids_train]

In [None]:
history = model.fit(x=x_train,
                    y=label_ids_train,
                    validation_split=0.3, 
                    batch_size=16, 
                    epochs=3) 

Epoch 1/3


In [None]:
history.history

In [None]:
model.