In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertModel, BertConfig
from sklearn.model_selection import train_test_split

from tqdm import tqdm

In [2]:
BERT_MODEL_NAME = 'bert-large-uncased' #'bert-base-uncased'

In [3]:
data = pd.read_csv('data/kaggle_ner.csv', sep=",", encoding="latin1").fillna(method='ffill')

In [4]:
class ContextNER:

    def __init__(self, df):

        self.__df = df

        self.all_words = set(df.Word.values)
        self.all_tags = set(df.Tag.values)

        self.num_words = len(self.all_words)
        self.num_tags = len(self.all_tags)

        self.sentences = self.__build_sentences()
        self.max_len = self.__get_maxlen()

        self.__build_Xy()
        self.__build_parsers()

    def __get_maxlen(self):
        return max([len(x) for x in self.sentences]) 

    def __build_sentences(self):

        return [x for x in self.__df.groupby('Sentence #').apply(
            lambda xdef: [x for x in zip(
                xdef.Word.values,
                xdef.Tag.values
            )]
        )]

    def __build_Xy(self):

        self.X = [[word for word, __ in value] for value in self.sentences]
        self.y = [[tag for __, tag in value] for value in self.sentences]

    def __build_parsers(self):
        
        self.tag2idx = {value: idx for idx, value in enumerate(self.all_tags)}
        self.idx2tag = {idx: value for value, idx in self.tag2idx.items()}

In [5]:
contextNER = ContextNER(data)

In [6]:
Words, Tags = contextNER.X, contextNER.y

In [7]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, do_lower_case=True)

In [8]:
MAX_LEN = contextNER.max_len

In [9]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return  ["[CLS]"] + tokenized_sentence + ["[SEP]"], ['O'] + labels + ['O']

def pad_seq(seq, max_seq_length):
    return pad_sequences(seq,
                         maxlen=max_seq_length,
                         dtype="long",
                         truncating="post",
                         padding="post")

In [10]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(Words, Tags)]

In [11]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [12]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, 
                          dtype="long", 
                          value=0.0,
                          truncating="post", 
                          padding="post")

In [13]:
tags = pad_sequences([[contextNER.tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, 
                     value=contextNER.tag2idx["O"], 
                     padding="post",
                     dtype="long", 
                     truncating="post")

In [14]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [15]:
tr_inputs, tr_tags = input_ids, tags
tr_masks = attention_masks

In [16]:
print("=" * 50)

index_sent = 156

for index_word, index_tag in zip(tr_inputs[index_sent], tr_tags[index_sent]):
    
    word = tokenizer.convert_ids_to_tokens(int(index_word))
    tag = contextNER.idx2tag.get(index_tag)
    
    if index_word != 0:
        print("{}\t|\t{}\t|\t{}\t|\t{}".format(index_word, word, tag, index_tag))

101	|	[CLS]	|	O	|	8
2028	|	one	|	O	|	8
2001	|	was	|	O	|	8
2741	|	sent	|	O	|	8
2067	|	back	|	O	|	8
2000	|	to	|	O	|	8
10411	|	sudan	|	B-geo	|	12
1010	|	,	|	O	|	8
2178	|	another	|	O	|	8
2000	|	to	|	O	|	8
8174	|	saudi	|	B-geo	|	12
9264	|	arabia	|	I-geo	|	1
1998	|	and	|	O	|	8
1996	|	the	|	O	|	8
2353	|	third	|	O	|	8
2000	|	to	|	O	|	8
5207	|	jordan	|	B-gpe	|	0
1012	|	.	|	O	|	8
102	|	[SEP]	|	O	|	8


In [17]:
for w, t in zip(Words[index_sent], Tags[index_sent]):
    print("{}\t|\t{}".format(w, t))

One	|	O
was	|	O
sent	|	O
back	|	O
to	|	O
Sudan	|	B-geo
,	|	O
another	|	O
to	|	O
Saudi	|	B-geo
Arabia	|	I-geo
and	|	O
the	|	O
third	|	O
to	|	O
Jordan	|	B-gpe
.	|	O


In [18]:
class ProcessingBERT:
    def __init__(self, contextNER):
    
        self._contextNER = contextNER;

    
    def __getitem__(self, item):
        
        text = self._contextNER.X[item]
        tags =  self._contextNER.y[item]

        ids = []
        target_tag = []

        for index, s in enumerate(text):
           
            inputs = tokenizer.encode(s, add_special_tokens=False)
            ids.extend(inputs)

            tag = self._contextNER.tag2idx.get(tags[index])
            
            index_pad = self._contextNER.tag2idx.get('O')
            
            target_tag.extend([tag] * len(inputs))

        ids = ids[:self._contextNER.max_len - 2]
        target_tag = target_tag[:self._contextNER.max_len - 2]
        
        ids = [101] + ids + [102]
        target_tag = [index_pad] + target_tag + [index_pad]
        
        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)
        
        padding_len = self._contextNER.max_len - len(ids)

        ids = ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)
        target_tag = target_tag + ([index_pad] * padding_len)

        return {
            "ids": np.array(ids),
            "mask": np.array(mask),
            "token_type_ids": np.array(token_type_ids),
            "target_tag": np.array(target_tag),
        }

In [19]:
cc = ProcessingBERT(contextNER)

In [20]:
cc[0]

{'ids': array([  101,  5190,  1997, 28337,  2031,  9847,  2083,  2414,  2000,
         6186,  1996,  2162,  1999,  5712,  1998,  5157,  1996, 10534,
         1997,  2329,  3629,  2013,  2008,  2406,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]),
 'mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 

In [21]:
idx = 908

for tagx, index in zip(Tags[idx], cc[idx]['target_tag']):
    tag = contextNER.idx2tag.get(index)
    print(tagx, '\t|\t', index, '\t|\t', tag)

O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
B-tim 	|	 8 	|	 O
O 	|	 15 	|	 B-tim
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
B-geo 	|	 8 	|	 O
O 	|	 12 	|	 B-geo
O 	|	 12 	|	 B-geo
O 	|	 12 	|	 B-geo
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
O 	|	 8 	|	 O
