In [2]:
!pip install transformers
!pip install tokenizers
!pip install tensorflow -U

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 13.5MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 38.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 46.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=d5af

In [3]:
import os
import re
import json
import string
import numpy as np
import pandas as pd

import tensorflow as tf

from tensorflow.keras import layers, models
from tensorflow.keras import metrics, optimizers, losses
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

In [4]:
import os

os.environ['KAGGLE_USERNAME'] = "" # Usuário Kaggle
os.environ['KAGGLE_KEY'] = "" # Token de Acesso

!kaggle datasets download -d abhinavwalia95/entity-annotated-corpus

!unzip entity-annotated-corpus.zip

Downloading entity-annotated-corpus.zip to /content
 76% 20.0M/26.4M [00:00<00:00, 51.1MB/s]
100% 26.4M/26.4M [00:00<00:00, 76.0MB/s]
Archive:  entity-annotated-corpus.zip
  inflating: ner.csv                 
  inflating: ner_dataset.csv         


In [5]:
max_len = 130
configuration = BertConfig()
data_csv = "/content/ner_dataset.csv"

slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)

slow_tokenizer.save_pretrained(save_path)

tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False,
                                                            reduction=tf.keras.losses.Reduction.NONE)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [6]:
def masked_ce_loss(real, pred):
    
    mask = tf.math.logical_not(tf.math.equal(real, 18))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

def create_model(num_tags):
    
    encoder = TFBertModel.from_pretrained("bert-base-uncased")
    
    encoder.trainable = False
 
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    
    inputs_berts = [input_ids, token_type_ids, attention_mask]
    
    embedding = encoder(inputs_berts)[0]
    
    bi_lstm = layers.Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(max_len // 2, 
                                                             return_sequences=True),
                                                             name='bilstm')(embedding)

    dropout = layers.TimeDistributed(layers.Dropout(0.3))(bi_lstm)

    dense_layer = layers.TimeDistributed(layers.Dense(max_len,
                                                      activation='relu',
                                                      name='last_dense'))(dropout)

    output = layers.Dense(num_tags + 1,
                          activation="softmax",
                          name='predictions')(dense_layer)

    model = models.Model(inputs=inputs_berts, outputs=output)

    model.compile(optimizer=optimizers.Adam(lr=0.001),
                  loss=[masked_ce_loss],
                  metrics=[metrics.SparseCategoricalAccuracy('accuracy')])
    
    return model

def process_csv(data_path):

    df = pd.read_csv(data_path, encoding="latin-1")
    df.loc[:, "Sentence #"] = df["Sentence #"].fillna(method="ffill")

    enc_tag = preprocessing.LabelEncoder()

    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])

    sentences = df.groupby("Sentence #")["Word"].apply(list).values
    tag = df.groupby("Sentence #")["Tag"].apply(list).values

    return sentences, tag, enc_tag


def create_inputs_targets(data_csv):

    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "tags": []
    }

    sentences, tags, tag_encoder = process_csv(data_csv)
    
    for sentence, tag in zip(sentences, tags):

        input_ids = []
        target_tags = []

        for idx, word in enumerate(sentence):

            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids)
            target_tags.extend([tag[idx]] * num_tokens)
        
        
        # Pad truncate
        input_ids = input_ids[:max_len - 2]
        target_tags = target_tags[:max_len - 2]

        input_ids = [101] + input_ids + [102]
        target_tags = [16] + target_tags + [16]
        token_type_ids = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)
        padding_len = max_len - len(input_ids)

        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tags = target_tags + ([17] * padding_len)
        
        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        dataset_dict["tags"].append(target_tags)
        
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]

    y = dataset_dict["tags"]

    return x, y, tag_encoder

In [30]:
num_tags = pd.read_csv(data_csv, encoding="latin-1")["Tag"].nunique()

x_train, y_train, tag_encoder = create_inputs_targets(data_csv)

In [7]:
model = create_model(num_tags)
    
model.summary()

model.fit(
    x_train,
    y_train,
    epochs=10,
    verbose=1,
    batch_size=16,
    validation_split=0.3
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 130)]        0                                            
__________________________________________

<tensorflow.python.keras.callbacks.History at 0x7fa2f578c890>

In [8]:
def create_test_input_from_text(texts):
    
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": []
    }
    
    for sentence in texts:
        
        input_ids = []
        
        for idx, word in enumerate(sentence.split()):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids)
            
        input_ids = input_ids[:max_len - 2]

        input_ids = [101] + input_ids + [102]
        n_tokens = len(input_ids)
        token_type_ids = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)
        padding_len = max_len - len(input_ids)

        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        
        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    
    return x, n_tokens

# Prevendo novas frases

In [58]:
test_inputs = ["Mateus Garcia lives in Picos Piauí, Brazil, alone and also has a beautiful little house in Crato of Ceará. in the next month he will work at the new company Brisanet Telecom how to develop software during this corona virus pandemic"]

In [66]:
x_test, n_tokens = create_test_input_from_text(test_inputs)
pred_test = model.predict(x_test)
pred_tags = np.argmax(pred_test,2)[0] 

le_dict = dict(zip(tag_encoder.transform(tag_encoder.classes_), tag_encoder.classes_))

In [67]:
print(x_test[0])
print("-" * 75)
print(pred_tags)

[[  101  6775  2271  7439  3268  1999 27263  2891 24624 10179  1010  4380
   1010  2894  1998  2036  2038  1037  3376  2210  2160  1999 13675 10610
   1997  8292  5400  1012  1999  1996  2279  3204  2002  2097  2147  2012
   1996  2047  2194  7987 29196  3388 18126  2129  2000  4503  4007  2076
   2023 21887  7865  6090  3207  7712   102     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]
---------------------------------------------------------------------------
[16  6  6 14 16 16  2  2 10 10 16  2 16 16 16 16 16 16 16 16 16 16  2  2
 16  2  2 16 16 16 16 16 16 16 16 16 16 16 16  

In [68]:
tokens = [tokenizer.id_to_token(x) for x in (x_test[0][0])]
preds = [le_dict.get(_, '[PAD]') for _ in pred_tags]

new_tokens, new_labels = [], []

for token, tag in zip(tokens, preds):
    
    if token == '[SEP]' or token == '[CLS]' or token == '[PAD]':
        continue
        
    if '##' in token:
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag)
        new_tokens.append(token)

In [69]:
for word, tag in zip(new_tokens, new_labels):
    print("{:15} {}".format(word, tag))

mateus          B-per
garcia          I-per
lives           O
in              O
picos           B-geo
piaui           I-geo
,               O
brazil          B-geo
,               O
alone           O
and             O
also            O
has             O
a               O
beautiful       O
little          O
house           O
in              O
crato           B-geo
of              O
ceara           B-geo
.               O
in              O
the             O
next            O
month           O
he              O
will            O
work            O
at              O
the             O
new             O
company         O
brisanet        B-org
telecom         I-org
how             O
to              O
develop         O
software        O
during          O
this            O
corona          O
virus           O
pandemic        O
