In [1]:
!pip install --quiet tensorflow-text
!pip install --quiet tokenizers
import numpy as np 
import pandas as pd 
from google.colab import drive
drive.mount('/content/drive')
import os 
import re
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import Model,Input,layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import *

import tensorflow_hub as hub 
import tokenizers
from tokenizers import BertWordPieceTokenizer
import tensorflow_text as text 

# !wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
# from conlleval import evaluate
# import matplotlib.pyplot as plt
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences
from data_loader import load_sentences, list_maker
from model import ner_model
from helper_functions import tag_encoder, str_maker
import platform
import sklearn 
from sklearn.metrics import f1_score,classification_report,confusion_matrix

# !pip install --quiet pipreqs

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
print('python version :',platform.python_version())
print('regex version :',re.__version__)
print('numpy version :', np.__version__)
print('pandas version :', pd.__version__)
print('tf version :', tf.__version__)
print('tf_text version :',text.__version__)
print('tf_hub version :',hub.__version__)
print('huggingface_tokenizers version :',tokenizers.__version__)
print('sklearn version', sklearn.__version__)

python version : 3.7.13
regex version : 2.2.1
numpy version : 1.21.6
pandas version : 1.3.5
tf version : 2.10.0
tf_text version : 2.10.0
tf_hub version : 0.12.0
huggingface_tokenizers version : 0.12.1
sklearn version 1.0.2


In [3]:
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'
# bert_url = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/4"
bert_url = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2'
# bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/2" empty.ADAM() f1 0.56

bert_layer = hub.KerasLayer(bert_url, trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")
tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=False)



In [4]:
train_set  = load_sentences('drive/MyDrive/Colab Notebooks/datasets/conn2003/train.txt')
test_set  = load_sentences('drive/MyDrive/Colab Notebooks/datasets/conn2003/test.txt')
validation_set  = load_sentences('drive/MyDrive/Colab Notebooks/datasets/conn2003/valid.txt')

print('train_set_length :',len(train_set))
print('test_set_length :',len(test_set))
print('validation_set_length :',len(validation_set))

sentences_train = list_maker(train_set, 0)
tags_train = list_maker(train_set, 1)

sentences_test = list_maker(test_set, 0)
tags_test = list_maker(test_set, 1)

sentences_validation = list_maker(validation_set, 0)
tags_validation = list_maker(validation_set, 1)


j_tags = ' '.join(tags_train)
unique_tags = np.unique(j_tags.split())
num_tags = len(unique_tags) ### number of unique tags
# unique_tags =  '[PAD]' + unique_tags
print('number of unique tags :', num_tags)
print('unique tags:', unique_tags)

train_set_length : 14041
test_set_length : 3453
validation_set_length : 3250
number of unique tags : 9
unique tags: ['B-LOC' 'B-MISC' 'B-ORG' 'B-PER' 'I-LOC' 'I-MISC' 'I-ORG' 'I-PER' 'O']


In [5]:
enc_2tags = {i:j for i,j in enumerate(unique_tags)}
tags_2enc = {j:i for i,j in enumerate(unique_tags)}

In [6]:
def token_aligner(sentences_train,tags_train):
    var2 = []
    for j in range(len(sentences_train)):
        encoded_sentence = tokenizer.encode(sentences_train[j])
        tags = tags_train[j].split(' ')
        counter = 0
        var1 = []
        for i in range(1, len(encoded_sentence.offsets)- 1):
            x = encoded_sentence.offsets[i][0]
            y = encoded_sentence.offsets[i-1][1]

            if x!=y :
                counter +=1
                var1.append(tags[counter])
            if x == y:
                var1.append(tags[counter]) 
        var2.append(var1)
    
    return var2

In [7]:
def encoder(x):
    tokenizer = BertWordPieceTokenizer(vocab=vocab_file, lowercase=True)
    ids = []
    type_ids = []
    attention_mask = []
    ids_len = []
    for i in range(len(x)):
        var1 = tokenizer.encode(x[i])
        ids.append(var1.ids)
        type_ids.append(var1.type_ids)
        attention_mask.append(var1.attention_mask)
        ids_len.append(len(var1.ids))
    return ids, type_ids, attention_mask,ids_len

In [8]:
aligned_tags_train = token_aligner(sentences_train,tags_train)
aligned_tags_val = token_aligner(sentences_validation,tags_validation)

In [9]:
train_ids, train_type_ids, train_attention_mask,ids_len = encoder(sentences_train)
encoded_tags_train = tag_encoder(tags_train,tags_2enc)

In [10]:
val_ids, val_type_ids, val_attention_mask,ids_len = encoder(sentences_validation)
encoded_tags_val = tag_encoder(tags_validation,tags_2enc)

In [11]:
input_len =128
def padder(x,pad_len):
    padded_var = pad_sequences(
    x,
    maxlen=pad_len,
    dtype='int32',
    padding='post',
    truncating='post',
    value=0.0
    )
    return padded_var

pad_ids_train = padder(train_ids,input_len)
pad_type_ids_train = padder(train_type_ids,input_len)
pad_attention_mask_train = padder(train_attention_mask,input_len)
pad_tags_train = padder(encoded_tags_train, input_len)


pad_ids_val = padder(val_ids,input_len)
pad_type_ids_val = padder(val_type_ids,input_len)
pad_attention_mask_val = padder(val_attention_mask,input_len)
pad_tags_val = padder(encoded_tags_val, input_len)

In [12]:
preprocessor = hub.KerasLayer(encoder_url)
encoder = hub.KerasLayer(bert_url,trainable=True)

In [13]:
# def ner_model(num_tags):
#     encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
#     bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/2"
# #     bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/2"


#     encoder = hub.KerasLayer(bert_url,trainable=True)
#     text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
#     encoder_inputs = preprocessor(text_input)
#     outputs = encoder(encoder_inputs)
#     sequence_output = outputs["sequence_output"]
#     sequence_output = Dropout(0.3)(sequence_output)
#     final_layer = Dense(num_tags, activation = 'softmax')(sequence_output)
#     return Model(inputs = text_input, outputs = final_layer)
bert_ner_model = ner_model(num_tags)

In [14]:
# bert_ner_model.compile(
#     loss = tf.keras.losses.SparseCategoricalCrossentropy(),
#     optimizer = tf.keras.optimizers.Adam(),
#     metrics = ['accuracy']
# )

# bert_ner_model.fit(
#     np.array(sentences_train),
#     pad_tags_train,
#     # deneme,
#     epochs = 1,
#     batch_size = 20
# )

In [15]:
# os.listdir('drive/MyDrive/Colab Notebooks/trained_models/bert_ner_model/model_weights')
# model_path = 'drive/MyDrive/Colab Notebooks/trained_models/bert_ner_model/model_weights'
# model_name = '/bert_ner_weights'
# model_path +model_name
# bert_ner_model.save_weights(model_path +model_name)
# os.listdir('drive/MyDrive/Colab Notebooks/trained_models/bert_ner_model/model_weights')


['bert_ner_weights.h5',
 'bert_ner_weights.data-00000-of-00001',
 'bert_ner_weights.index',
 'checkpoint']

In [16]:
bert_ner_model.load_weights('drive/MyDrive/Colab Notebooks/trained_models/bert_ner_model/model_weights/bert_ner_weights.h5')

In [17]:
# new_model = ner_model(num_tags)
# new_model.load_weights(model_path +model_name)


In [18]:
p = bert_ner_model.predict(np.array(sentences_validation))



In [19]:
top2 = []
v2 = []
for i in range(len(encoded_tags_val)):
    x = encoded_tags_val[i]
    v1 = []
    top1 = []
    for j in range(len(x)):
        v1.append(p[i][j])
        top1.append(int( tf.math.top_k(p[i][j],k=1)[1]))
    v2.append(v1)
    top2.append(top1)

In [20]:

f1_score((np.concatenate(encoded_tags_val)), (np.concatenate(top2)), average='micro')

0.8424126786340095

In [21]:
print(classification_report(np.concatenate(encoded_tags_val), np.concatenate(top2),
                            target_names = ['0','1','2','3','4','5','6','7','8']
                            ))

conf_mat=confusion_matrix(np.concatenate(encoded_tags_val), np.concatenate(top2))
print(conf_mat)

              precision    recall  f1-score   support

           0       0.26      0.72      0.38      1837
           1       0.72      0.42      0.53       922
           2       0.73      0.54      0.62      1341
           3       0.65      0.55      0.60      1842
           4       0.50      0.57      0.53       257
           5       0.70      0.36      0.48       346
           6       0.67      0.45      0.54       751
           7       0.71      0.47      0.57      1307
           8       0.93      0.90      0.92     42759

    accuracy                           0.84     51362
   macro avg       0.65      0.55      0.57     51362
weighted avg       0.88      0.84      0.85     51362

[[ 1325     6    17    16    22     0     6     0   445]
 [  110   390    22    17     5    10     1     2   365]
 [   94    15   725   110     2     0    29     4   362]
 [  190     4     8  1018     1     0     4    71   546]
 [   39     0     1     0   146     0     3     3    65]
 [   35   