In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
!pip install --quiet tensorflow-text
!pip install --quiet tokenizers

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import Model,Input,layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *

import tensorflow_hub as hub 
from tokenizers import BertWordPieceTokenizer
import tensorflow_text as text 

# !wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
# from conlleval import evaluate
import matplotlib.pyplot as plt
import seaborn as sns 
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# TF-Hub handles: the cased-BERT text preprocessor and the matching cased BERT encoder.
encoder_url = "https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3"
bert_url = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"
# Alternative handle that was tried (original note: "empty.ADAM() f1 0.56"):
# bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/2"

# Load the encoder once here to reach the vocabulary file shipped with it.
bert_layer = hub.KerasLayer(bert_url, trainable=True)
vocab_asset = bert_layer.resolved_object.vocab_file
vocab_file = vocab_asset.asset_path.numpy().decode("utf-8")

# Word-piece tokenizer built on that vocabulary; casing is preserved.
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=False)

In [3]:
def load_sentences(filepath):
    """Read a CoNLL-style column file into a list of sentences.

    Every data line is split on whitespace; column 0 is taken as the token and
    column 3 as its tag. Blank (or whitespace-only) lines and lines whose first
    column is ``-DOCSTART-`` end the current sentence.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list of sentences in file order, each sentence being a list of
        ``(token, tag)`` tuples. Empty sentences are never emitted.

    Raises:
        IndexError: If a data line has fewer than four columns.
    """
    final = []
    sentence = []

    with open(filepath, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it with readlines().
        for line in f:
            fields = line.split()

            # Separator: blank line or a document marker (any column layout).
            if not fields or fields[0] == '-DOCSTART-':
                if sentence:
                    final.append(sentence)
                    sentence = []
                continue

            sentence.append((fields[0], fields[3]))

    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        final.append(sentence)

    return final


def list_maker(data_set,col):
    """Join one column of every sentence into a single space-separated string.

    Args:
        data_set: Iterable of sentences, each an iterable of indexable items
            (e.g. ``(token, tag)`` tuples).
        col: Index of the field to extract from every item.

    Returns:
        A list with one string per sentence: the selected fields joined by a
        single space, with runs of spaces collapsed and surrounding whitespace
        removed.
    """
    joined = []

    for sentence in data_set:
        joined_line = ' '.join(item[col] for item in sentence)
        # Collapse every run of spaces; a single two-for-one pass leaves runs of 3+.
        joined_line = re.sub(r' {2,}', ' ', joined_line)
        # Trim both ends; the raw string avoids the invalid "\s" escape sequence.
        joined_line = re.sub(r"^\s+|\s+$", "", joined_line, flags=re.UNICODE)
        joined.append(joined_line)

    return joined

In [4]:
# Read the three dataset splits (testa is used as validation, testb as test).
train_set = load_sentences('/kaggle/input/conll2003-dataset/conll2003/eng.train')
test_set = load_sentences('/kaggle/input/conll2003-dataset/conll2003/eng.testb')
validation_set = load_sentences('/kaggle/input/conll2003-dataset/conll2003/eng.testa')

# Report how many sentences each split holds.
for _label, _split in (
    ('train_set_length :', train_set),
    ('test_set_length :', test_set),
    ('validation_set_length :', validation_set),
):
    print(_label, len(_split))

# Column 0 holds the tokens, column 1 the tags; one joined string per sentence.
sentences_train, tags_train = (list_maker(train_set, col) for col in (0, 1))
sentences_test, tags_test = (list_maker(test_set, col) for col in (0, 1))
sentences_validation, tags_validation = (list_maker(validation_set, col) for col in (0, 1))

# Tag vocabulary is derived from the training split only.
j_tags = ' '.join(tags_train)
unique_tags = np.unique(j_tags.split())
num_tags = len(unique_tags)  # number of unique tags
print('number of unique tags :', num_tags)
print('unique tags:', unique_tags)

In [5]:
# Tag ids start at 1; the two dictionaries are inverse views of the same mapping.
enc_2tags = dict(enumerate(unique_tags, start=1))
tags_2enc = {tag: tag_id for tag_id, tag in enc_2tags.items()}

def tag_encoder(tags, tag_to_id=None):
    """Convert whitespace-separated tag strings into lists of integer ids.

    Args:
        tags: Iterable of strings, each holding the whitespace-separated tags
            of one sentence.
        tag_to_id: Optional mapping from tag string to integer id. When
            omitted, the module-level ``tags_2enc`` mapping is used.

    Returns:
        A list with one list of ids per input string, in the same order.

    Raises:
        KeyError: If a tag is not present in the mapping.
    """
    if tag_to_id is None:
        tag_to_id = tags_2enc

    return [
        [tag_to_id[str(tag)] for tag in sentence_tags.split()]
        for sentence_tags in tags
    ]

In [6]:
class str_maker :
    """Whitespace normaliser for a collection of strings."""

    def __init__(self, data):
        # Iterable of strings that splitter() will normalise.
        self.data = data

    def splitter(self):
        """Return every string of ``self.data`` with whitespace runs reduced to one space."""
        return [' '.join(entry.split()) for entry in self.data]

In [7]:
def token_aligner(sentences_train,tags_train, wordpiece_tokenizer=None):
    """Repeat each word-level tag for every sub-token produced for that word.

    A sentence is encoded and the character offsets of consecutive sub-tokens
    are compared: when a sub-token starts exactly where the previous one ended
    it is treated as part of the same word, otherwise the next tag is used.

    Args:
        sentences_train: Sequence of sentence strings.
        tags_train: Sequence of tag strings (tags separated by single spaces),
            parallel to ``sentences_train``.
        wordpiece_tokenizer: Optional object exposing ``encode(text)`` whose
            result has an ``offsets`` sequence of ``(start, end)`` pairs. When
            omitted, the module-level ``tokenizer`` is used.

    Returns:
        A list with one list of tags per sentence, one tag per sub-token
        (the first and last encoded positions are skipped).

    Raises:
        IndexError: If a sentence contains more offset gaps than it has tags.
    """
    if wordpiece_tokenizer is None:
        wordpiece_tokenizer = tokenizer

    aligned = []
    for idx, sentence in enumerate(sentences_train):
        offsets = wordpiece_tokenizer.encode(sentence).offsets
        word_tags = tags_train[idx].split(' ')
        word_idx = 0
        piece_tags = []

        # Skip the first and last positions (presumably the special tokens
        # added by the tokenizer - confirm for non-default tokenizers).
        for pos in range(1, len(offsets) - 1):
            # A gap between the previous end and this start marks a new word.
            if offsets[pos][0] != offsets[pos - 1][1]:
                word_idx += 1
            piece_tags.append(word_tags[word_idx])

        aligned.append(piece_tags)

    return aligned

In [8]:
def encoder(x, lowercase=False):
    """Tokenize every sentence and collect the model input features.

    Args:
        x: Sequence of sentence strings.
        lowercase: Whether the word-piece tokenizer lowercases its input.
            Defaults to ``False`` so that it agrees with the cased vocabulary
            in ``vocab_file`` and with the module-level ``tokenizer``.

    Returns:
        A 4-tuple of parallel lists, one entry per sentence:
        ``(ids, type_ids, attention_mask, ids_len)`` where ``ids_len`` holds
        the number of ids of each encoded sentence.
    """
    wp_tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=lowercase)

    ids = []
    type_ids = []
    attention_mask = []
    ids_len = []

    for sentence in x:
        encoded = wp_tokenizer.encode(sentence)
        ids.append(encoded.ids)
        type_ids.append(encoded.type_ids)
        attention_mask.append(encoded.attention_mask)
        ids_len.append(len(encoded.ids))

    return ids, type_ids, attention_mask, ids_len

In [9]:
# Sub-token-level tag lists for the training and validation sentences.
aligned_tags_train = token_aligner(sentences_train,tags_train)
aligned_tags_val = token_aligner(sentences_validation,tags_validation)

# Tokenizer features and integer tag ids for the training split.
# NOTE(review): tag ids are built from the word-level tag strings, not from
# aligned_tags_train - confirm this is the intended labelling.
train_ids, train_type_ids, train_attention_mask, ids_len = encoder(sentences_train)
encoded_tags_train = tag_encoder(tags_train)

# Same for the validation split.
# NOTE(review): ids_len is rebound here, so the training lengths are lost.
val_ids, val_type_ids, val_attention_mask, ids_len = encoder(sentences_validation)
encoded_tags_val = tag_encoder(tags_validation)

In [10]:
# for i in (np.unique(np.concatenate(aligned_tags_train), return_counts = True)):
#     print(i)

# for i in (np.unique(np.concatenate(aligned_tags_val), return_counts = True)):
#     print(i[0], i[1])

In [11]:
# Frequency table of the aligned tags in the validation split (tag, count per row).
val_tag_names, val_tag_counts = np.unique(np.concatenate(aligned_tags_val), return_counts=True)
pd.DataFrame(np.array((val_tag_names, val_tag_counts)).T)

In [12]:
# Frequency table of the aligned tags in the training split (tag, count per row).
train_tag_names, train_tag_counts = np.unique(np.concatenate(aligned_tags_train), return_counts=True)
pd.DataFrame(np.array((train_tag_names, train_tag_counts)).T)

In [13]:
# Target length used when padding the tokenizer features.
input_len = 128


def padder(x,pad_len):
    """Pad or cut every sequence in ``x`` at its end to exactly ``pad_len`` items.

    Shorter sequences are filled with 0 and the result of ``pad_sequences``
    (requested as int32) is returned unchanged.
    """
    return pad_sequences(
        x, maxlen=pad_len, dtype='int32', padding='post', truncating='post', value=0.0
    )

# Training split: pad the three tokenizer feature lists to input_len.
pad_ids_train, pad_type_ids_train, pad_attention_mask_train = (
    padder(feature, input_len)
    for feature in (train_ids, train_type_ids, train_attention_mask)
)
# Labels use a literal 128 instead of input_len.
# NOTE(review): presumably this has to match the model's output length - confirm.
pad_tags_train = padder(encoded_tags_train, 128)

# Validation split: same treatment.
pad_ids_val, pad_type_ids_val, pad_attention_mask_val = (
    padder(feature, input_len)
    for feature in (val_ids, val_type_ids, val_attention_mask)
)
pad_tags_val = padder(encoded_tags_val, 128)

In [14]:
# Hub layer that turns raw strings into the inputs expected by the BERT encoder.
preprocessor = hub.KerasLayer(encoder_url)
# Trainable BERT encoder layer.
# NOTE(review): this rebinds the name `encoder`, which until now referred to the
# tokenizing helper function; re-running the cell that calls encoder(sentences)
# after this one will call the layer instead.
encoder = hub.KerasLayer(bert_url,trainable=True)

In [15]:
# The model consumes raw strings; the hub preprocessor builds the encoder inputs.
text_input = Input(shape=(), dtype=tf.string)
encoder_inputs = preprocessor(text_input)

# Per-position encoder states, lightly regularised with dropout.
outputs = encoder(encoder_inputs)
sequence_output = layers.Dropout(0.1)(outputs["sequence_output"])

# One softmax per position over the tag ids plus the extra id 0.
final_layer = layers.Dense(num_tags + 1, activation='softmax')(sequence_output)

bert_ner_model = Model(inputs=text_input, outputs=final_layer)

In [16]:
# Integer labels per position -> sparse categorical cross-entropy.
bert_ner_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Train on the raw training sentences against the padded tag ids.
# NOTE(review): padded positions (id 0) are part of the loss and accuracy.
bert_ner_model.fit(
    x=np.array(sentences_train),
    y=pad_tags_train,
    batch_size=20,
    epochs=3,
)

In [17]:
# Per-position class probabilities for every validation sentence.
p = bert_ner_model.predict(np.array(sentences_validation))

# Number of non-padding label positions in each padded validation row.
non_zero_pos = [int(np.count_nonzero(label_row)) for label_row in pad_tags_val]

In [29]:
# Display the padded tag-id row of the first validation sentence.
pad_tags_val[0]

In [18]:
# For every validation sentence keep the top-1 class index of its first
# `n_labels` positions, where n_labels is the sentence's non-padding label count.
# (A variant slicing 0:n_labels+1 was tried earlier and left disabled.)
d_pos = [
    tf.math.top_k(p[row_idx][0:n_labels], k=1)[1]
    for row_idx, n_labels in enumerate(non_zero_pos)
]

In [19]:
from sklearn.metrics import f1_score,classification_report,confusion_matrix

# Micro-averaged F1 over all validation positions, flattened across sentences.
val_true_ids = np.concatenate(encoded_tags_val)
val_pred_ids = np.concatenate(d_pos)
f1_score(y_true=val_true_ids, y_pred=val_pred_ids, average='micro')

In [20]:
# Derive the label ids and their display names from the tag vocabulary instead
# of a fixed list of ten digits: a fixed list raises ValueError whenever the
# number of distinct labels differs from its length, and it hides the tag names.
report_labels = [0] + sorted(enc_2tags)
report_names = ['[PAD]'] + [str(enc_2tags[tag_id]) for tag_id in sorted(enc_2tags)]

print(classification_report(
    np.concatenate(encoded_tags_val),
    np.concatenate(d_pos).ravel(),  # flatten the (n, 1) top-k indices
    labels=report_labels,
    target_names=report_names,
))

In [27]:
# Distinct predicted class ids and how often each one occurs.
predicted_ids_flat = np.concatenate(d_pos)
np.unique(predicted_ids_flat, return_counts=True)

In [21]:
# Confusion matrix of true tag ids against predicted class ids.
conf_mat = confusion_matrix(
    y_true=np.concatenate(encoded_tags_val),
    y_pred=np.concatenate(d_pos),
)
print(conf_mat)

In [22]:
# Draw the confusion matrix as a seaborn heatmap with thin cell borders.
ax = sns.heatmap(conf_mat, linewidth=0.5)
plt.show()

In [23]:
# Disabled alternative that draws on an explicit Axes:
# fig, ax = plt.subplots()
# im = ax.imshow(conf_mat)
# Render the confusion matrix as an image using the 'hot' colormap.
plt.imshow(conf_mat, cmap='hot', interpolation='nearest')
plt.show()