In [48]:
import pickle

import json

import numpy as np
import dill
import re
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from tqdm import tqdm
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from sklearn.metrics import classification_report
from deeple_preprocessor.tokenize import newmm_tokenize

In [53]:
with open('dataset/ner.data', 'rb') as file:
    datatofile = dill.load(file)
    
tagged_sents = []
prefixs = ['นาย', 'นาง', 'น.ส.', 'นางสาว', 'น.พ.', 'นพ.', 'พ.ต.ท.']

def replace_prefix(text):
    for i in prefixs:
        text = text.replace(i, '')
    return text

for i in datatofile:
    is_contain_prefix = False
    text_inside = []
    if(len(i) <= 1):
        continue
    for idx, j in enumerate(i):
        tag = j[2]
        if('ไม่ยืนยัน' in j[1]):
            tag = 'O'
        if(any([prefix in j[0] for prefix in prefixs])):
            is_contain_prefix = True
        ## Add S and E tag             
#         if('B-' in tag):
#             if(idx == len(i)-1 or 'I-' not in i[idx+1][1]):
#                 tag = tag.replace('B-', 'S-')
#         if('I-' in tag):
#             if(idx == len(i)-1 or 'I-' not in i[idx+1][1]):
#                 tag = tag.replace('I-', 'E-')
            
        text_inside.append((j[0],tag))
    tagged_sents.append(text_inside)
    
    
    if(is_contain_prefix):
        dup_text_inside = []
        for word, tag in text_inside:
            word = replace_prefix(word)
            if(word == ''):
                continue
            else:
                dup_text_inside.append((word, tag))
        tagged_sents.append(dup_text_inside)
    
train_sents, test_sents= train_test_split(tagged_sents, test_size=0.1, random_state=112)

In [55]:
thai2fit_model = KeyedVectors.load_word2vec_format('thai2fit/thai2vecNoSym.bin',binary=True)
thai2fit_weight = thai2fit_model.vectors

In [56]:
word_list=[]
ner_list=[]
thai2dict = {}

for sent in train_sents:
    for word in sent:
        word_list.append(word[0])
        ner_list.append(word[1])
        
for word in thai2fit_model.index2word:
    thai2dict[word] = thai2fit_model[word]

word_list.append("pad")
word_list.append("unknown") #Special Token for Unknown words ("UNK")
ner_list.append("pad")

all_words = sorted(set(word_list))
all_ner = sorted(set(ner_list))
all_thai2dict = sorted(set(thai2dict))

word_to_ix = dict((c, i) for i, c in enumerate(all_words)) #convert word to index 
ner_to_ix = dict((c, i) for i, c in enumerate(all_ner)) #convert ner to index
thai2dict_to_ix = dict((c, i) for i, c in enumerate(thai2dict)) #convert thai2fit to index 

ix_to_word = dict((v,k) for k,v in word_to_ix.items()) #convert index to word
ix_to_ner = dict((v,k) for k,v in ner_to_ix.items())  #convert index to ner
ix_to_thai2dict = dict((v,k) for k,v in thai2dict_to_ix.items())  #convert index to thai2fit

n_word = len(word_to_ix)
n_tag = len(ner_to_ix)
n_thai2dict = len(thai2dict_to_ix)

In [57]:
chars = set([w_i for w in thai2dict for w_i in w])
char2idx = {c: i + 5 for i, c in enumerate(chars)}

char2idx["pad"] = 0
char2idx["unknown"] = 1
char2idx[" "] = 2

char2idx["$"] = 3
char2idx["#"] = 4
char2idx["!"] = 5
char2idx["%"] = 6
char2idx["&"] = 7
char2idx["*"] = 8
char2idx["+"] = 9
char2idx[","] = 10
char2idx["-"] = 11
char2idx["."] = 12
char2idx["/"] = 13
char2idx[":"] = 14
char2idx[";"] = 15
char2idx["?"] = 16
char2idx["@"] = 17
char2idx["^"] = 18
char2idx["_"] = 19
char2idx["`"] = 20
char2idx["="] = 21
char2idx["|"] = 22
char2idx["~"] = 23
char2idx["'"] = 24
char2idx['"'] = 25

char2idx["("] = 26
char2idx[")"] = 27
char2idx["{"] = 28
char2idx["}"] = 29
char2idx["<"] = 30
char2idx[">"] = 31
char2idx["["] = 32
char2idx["]"] = 33
char2idx["\n"] = 34

n_chars = len(char2idx)

In [58]:
# with open('saved_model/chardict.json', 'w') as chardict:
#     json.dump(char2idx, chardict)
# with open('saved_model/nerdict.json', 'w') as nerdict:
#     json.dump(ner_to_ix, nerdict)
# with open('saved_model/thai2dict_to_ix.json', 'w') as f:
#     json.dump(thai2dict_to_ix, f)

In [59]:
max_len = 284
max_len_char = 30

character_LSTM_unit = 32
char_embedding_dim = 32
main_lstm_unit = 256 ## Bidirectional 256 + 256 = 512
lstm_recurrent_dropout = 0.5

train_batch_size = 32
train_epochs = 150

In [60]:
def prepare_sequence_word(input_text):
    idxs = list()
    for word in input_text:
        if word in thai2dict:
            idxs.append(thai2dict_to_ix[word])
        else:
            idxs.append(thai2dict_to_ix["unknown"]) #Use UNK tag for unknown word
    return idxs

def prepare_sequence_target(input_label):
    idxs = [ner_to_ix[w] for w in input_label]
    return idxs

In [61]:
input_sent =[ [ word[0] for word in sent]for sent in train_sents ] #words only
train_targets =[ [ word[1] for word in sent]for sent in train_sents ] #NER only

input_test_sent =[ [ word[0] for word in sent]for sent in test_sents ] #words only
test_targets =[ [ word[1] for word in sent]for sent in test_sents ] #NER only

In [62]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM, Bidirectional, Input, TimeDistributed, AdditiveAttention
from tensorflow.keras.layers import Dense, SpatialDropout1D, Embedding, concatenate, Dropout
from tensorflow_addons.layers import GELU
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from crf.crf import CRF
from crf.crf_losses import crf_loss

In [63]:
## Word Training
X_word_tr = [prepare_sequence_word(s) for s in input_sent]
X_word_tr = pad_sequences(
    maxlen=max_len, sequences=X_word_tr, value=thai2dict_to_ix["pad"], padding='post', truncating='post'
)

## Character Training
X_char_tr = []
for sentence in tqdm(train_sents):
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                if(sentence[i][0][j] in char2idx):
                    word_seq.append(char2idx.get(sentence[i][0][j]))
                else:
                    word_seq.append(char2idx.get("unknown"))
            except:
                word_seq.append(char2idx.get("pad"))
        sent_seq.append(word_seq)
    X_char_tr.append(np.array(sent_seq))

## Sequence Label Training
y_tr = [prepare_sequence_target(s) for s in train_targets]
y_tr = pad_sequences(maxlen=max_len, sequences=y_tr, value=ner_to_ix["pad"], padding='post', truncating='post')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9008/9008 [00:33<00:00, 268.17it/s]


In [64]:
## Word Testing
X_word_te = [prepare_sequence_word(s) for s in input_test_sent]
X_word_te = pad_sequences(maxlen=max_len, sequences=X_word_te, value=thai2dict_to_ix["pad"], padding='post', truncating='post')

## Character Testing
X_char_te = []
for sentence in tqdm(test_sents):
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                if(sentence[i][0][j] in char2idx):
                    word_seq.append(char2idx.get(sentence[i][0][j]))
                else:
                    word_seq.append(char2idx.get("unknown"))
            except:
                word_seq.append(char2idx.get("pad"))    
        sent_seq.append(word_seq)
    X_char_te.append(np.array(sent_seq))

## Sequence Label Testing
y_te = [prepare_sequence_target(s) for s in test_targets]
y_te = pad_sequences(maxlen=max_len, sequences=y_te, value=ner_to_ix["pad"], padding='post', truncating='post')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [00:03<00:00, 252.59it/s]


In [65]:
# Word Input
word_in = Input(shape=(max_len,), name='word_input_')

# Word Embedding Using Thai2Fit
word_embeddings = Embedding(input_dim=n_thai2dict,
                            output_dim=400,
                            weights = [thai2fit_weight],input_length=max_len,
                            mask_zero=False,
                            name='word_embedding', trainable=False)(word_in)

# Character Input
char_in = Input(shape=(max_len, max_len_char,), name='char_input')

# Character Embedding
emb_char = TimeDistributed(Embedding(input_dim=n_chars, output_dim=char_embedding_dim, 
                           input_length=max_len_char, mask_zero=False))(char_in)

# Character Sequence to Vector via BiLSTM
char_enc = TimeDistributed(Bidirectional(LSTM(units=character_LSTM_unit, return_sequences=False, recurrent_dropout=lstm_recurrent_dropout)))(emb_char)


# Concatenate All Embedding
all_word_embeddings = concatenate([word_embeddings, char_enc])
all_word_embeddings = SpatialDropout1D(0.3)(all_word_embeddings)

# Main Model Dense attention
main_lstm = Bidirectional(LSTM(units=main_lstm_unit, return_sequences=True, recurrent_dropout=lstm_recurrent_dropout))(all_word_embeddings)
main_lstm = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

# CRF
out = CRF(n_tag)(main_lstm)  # CRF layer

# Model
model = Model(inputs=[word_in, char_in], outputs=out)

model.compile(optimizer="adam", loss=crf_loss, metrics=[Accuracy()])

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         [(None, 284, 30)]    0                                            
__________________________________________________________________________________________________
word_input_ (InputLayer)        [(None, 284)]        0                                            
__________________________________________________________________________________________________
time_distributed_6 (TimeDistrib (None, 284, 30, 32)  12800       char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 284, 400)     22270800    word_input_[0][0]                
____________________________________________________________________________________________

In [66]:
# early_stopper = EarlyStopping(patience=5, restore_best_weights=True)

In [67]:
filepath="saved_model/weights-improvement-{epoch:02d}-{accuracy:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model.fit(
    [X_word_tr, np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
    y_tr,
    batch_size=train_batch_size, epochs=1, verbose=1,callbacks=callbacks_list,
    validation_data=(
        [X_word_te, np.array(X_char_te).reshape((len(X_char_te), max_len, max_len_char))],
        y_te
    ),
    shuffle=True
)

Train on 9008 samples, validate on 1001 samples
Epoch 1/2
Epoch 00001: val_accuracy improved from -inf to 0.97854, saving model to saved_model/weights-improvement-01-0.961.hdf5
Epoch 2/2
Epoch 00002: val_accuracy improved from 0.97854 to 0.98700, saving model to saved_model/weights-improvement-02-0.982.hdf5


In [None]:
# save_filepath="saved_model/last_weight-50.hdf5"
# model.save_weights(save_filepath)

In [46]:
load_filepath="saved_model/weights-improvement-53-0.993.hdf5"
model.load_weights(load_filepath)

In [68]:
pred_model = model.predict([X_word_te,np.array(X_char_te).reshape((len(X_char_te),max_len, max_len_char))], verbose=1)



In [69]:
y_pred = []
y_true = []

for i in range(0,len(pred_model)):
    try:
        out = pred_model[i]
        true = y_te[i]
        revert_pred=[ix_to_ner[i] for i in out]
        revert_true=[ix_to_ner[i] for i in true]
        y_pred.append(revert_pred)
        y_true.append(revert_true)
    except:
        print (i)

In [70]:
def ner_classification_report(y_true, y_pred):
 
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    tagset = list(sorted(set(lb.classes_)))
    tagset = [i for i in tagset if len(i.split('-')) == 2]
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    print(list(sorted(set(lb.classes_))))
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
        digits=4
    )

In [71]:
y_pred = np.array(y_pred)
y_true = np.array(y_true)

print(ner_classification_report(y_true, y_pred))

['B-DATE', 'B-EMAIL', 'B-LAW', 'B-LEN', 'B-LOCATION', 'B-MONEY', 'B-ORGANIZATION', 'B-PERCENT', 'B-PERSON', 'B-PHONE', 'B-TIME', 'B-URL', 'B-ZIP', 'I-DATE', 'I-EMAIL', 'I-LAW', 'I-LEN', 'I-LOCATION', 'I-MONEY', 'I-ORGANIZATION', 'I-PERCENT', 'I-PERSON', 'I-PHONE', 'I-TIME', 'I-URL', 'O', 'pad']
                precision    recall  f1-score   support

        B-DATE     0.7710    0.8312    0.8000       397
       B-EMAIL     0.0000    0.0000    0.0000         1
         B-LAW     0.0000    0.0000    0.0000        36
         B-LEN     0.0000    0.0000    0.0000        12
    B-LOCATION     0.8060    0.6639    0.7281       851
       B-MONEY     0.8571    0.8696    0.8633        69
B-ORGANIZATION     0.7507    0.7062    0.7278      1096
     B-PERCENT     0.0000    0.0000    0.0000        20
      B-PERSON     0.7485    0.7415    0.7450       851
       B-PHONE     0.0000    0.0000    0.0000        21
        B-TIME     0.7090    0.5249    0.6032       181
         B-URL     0.0000    0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
