In [23]:
# Save / Load File
import dill
import pickle

# Plot Graph
import matplotlib.pyplot as plt

# Sklearn Report
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from itertools import chain

# Load Vectors
from gensim.models import KeyedVectors

# Utility
from pythainlp.tokenize import word_tokenize
import numpy as np
import time

# Model Utility
from sklearn.model_selection import train_test_split
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load data

In [24]:
path_name = "../../dataset/data/"

# NOTE(review): dill.load executes arbitrary code embedded in the file; only
# open corpora that come from a trusted source.
with open(path_name + 'comment-pos.data', 'rb') as file:
    datatofile = dill.load(file)

# Every record is a sequence of (word, pos, label) triples; the POS tag is not
# used downstream, so only (word, label) pairs are kept.
tagged_sents = [
    [(word, label) for word, _pos, label in record]
    for record in datatofile
]

# Fixed random_state keeps the 80/20 split identical between runs.
train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)

print(len(train_sents))
print(len(test_sents))
print(train_sents[1])

552
138
[('ถ้า', 'B-c'), ('เดินทาง', 'I-c'), ('กลางคืน', 'I-c'), ('ก็', 'I-c'), ('รถทัวร์', 'I-c'), ('ครับ', 'I-c'), (' ', 'I-c'), (' ', 'O'), ('เพราะ', 'B-p'), ('รถ', 'I-p'), ('ไม่', 'I-p'), ('เยอะ', 'I-p'), (' ', 'I-p'), ('ความเสี่ยง', 'I-p'), ('การ', 'I-p'), ('เกิด', 'I-p'), ('อุบัติ', 'I-p'), ('ห', 'I-p'), ('ตุ', 'I-p'), ('ก็', 'I-p'), ('น้อย', 'I-p'), ('(', 'I-p'), ('มั้ง', 'I-p'), (')', 'I-p'), (' ', 'I-p'), (' ', 'O'), ('ถ้า', 'B-c'), ('กลางวัน', 'I-c'), ('ก็', 'I-c'), ('เครื่องบิน', 'I-c'), ('ครับ', 'I-c'), (' ', 'I-c'), (' ', 'O'), ('เพราะ', 'B-p'), (' ', 'I-p'), ('มัน', 'I-p'), ('ใช้เวลา', 'I-p'), ('น้อย', 'I-p'), ('จะ', 'I-p'), ('ได้', 'I-p'), ('มี', 'I-p'), ('เวลา', 'I-p'), ('ระหว่าง', 'I-p'), ('วัน', 'I-p'), ('เยอะ', 'I-p'), ('ๆ', 'I-p'), (' ', 'I-p')]


In [25]:
# Pre-trained word vectors in binary word2vec format.
thai2fit_model = KeyedVectors.load_word2vec_format(
    '../../thai2vec/thai2vecNoSym.bin',
    binary=True,
)

# Raw embedding matrix; it is passed to the word Embedding layer as its weights.
thai2fit_weight = thai2fit_model.vectors
thai2fit_weight.shape

(55677, 400)

In [26]:
# Flatten the training split into parallel lists of words and labels.
word_list = [word for sent in train_sents for word, label in sent]
ner_list = [label for sent in train_sents for word, label in sent]

# word -> vector lookup, in the same order as the rows of the embedding matrix.
thai2dict = {word: thai2fit_model[word] for word in thai2fit_model.index_to_key}

# Special tokens: padding for both vocabularies, plus an unknown-word marker.
word_list.extend(["pad", "unknown"])
ner_list.append("pad")

all_word = sorted(set(word_list))
all_ner = sorted(set(ner_list))
all_thai2dict = sorted(set(thai2dict))

# Forward lookups (item -> id).
word_to_idx = {word: i for i, word in enumerate(all_word)}
ner_to_idx = {label: i for i, label in enumerate(all_ner)}
# Enumerates thai2dict itself (not the sorted list) so that every id equals the
# word's position in thai2fit_model.index_to_key.
thai2dict_to_idx = {word: i for i, word in enumerate(thai2dict)}

# Reverse lookups (id -> item).
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
idx_to_ner = {idx: label for label, idx in ner_to_idx.items()}
idx_to_thai2fit = {idx: word for word, idx in thai2dict_to_idx.items()}

n_word = len(word_to_idx)
n_tag = len(ner_to_idx)
n_thai2dict = len(thai2dict_to_idx)

print("n_word: ", n_word)
print("n_tag: ", n_tag)
print("n_thai2dict", n_thai2dict)
print(ner_to_idx)

n_word:  4542
n_tag:  6
n_thai2dict 55677
{'B-c': 0, 'B-p': 1, 'I-c': 2, 'I-p': 3, 'O': 4, 'pad': 5}


In [27]:
# Character vocabulary for the character-level embedding.
#
# Ids 0..33 are reserved, in this exact order: padding, unknown character,
# space, then ASCII punctuation and brackets. The position in the list is the id.
reserved_tokens = [
    "pad", "unknown", " ",
    "$", "#", "!", "%", "&", "*", "+", ",", "-", ".", "/", ":", ";", "?", "@",
    "^", "_", "`", "=", "|", "~", "'", '"',
    "(", ")", "{", "}", "<", ">", "[", "]",
]
char2idx = {token: idx for idx, token in enumerate(reserved_tokens)}

# Every character that occurs in a thai2fit vocabulary word.
chars = set(w_i for w in thai2dict for w_i in w)

# Number the remaining characters AFTER the reserved range. Starting them at 5
# (as before) made the first vocabulary characters share ids 5..33 with the
# punctuation above, so unrelated characters were mapped to the same embedding
# row. Sorting makes the mapping reproducible between kernel restarts (set
# iteration order of strings is not stable).
# NOTE: this changes the saved chardict; weights trained with the old mapping
# must be retrained.
for ch in sorted(chars):
    if ch not in char2idx:
        char2idx[ch] = len(char2idx)

# Ids are contiguous 0..n_chars-1, so n_chars is a valid Embedding input_dim.
n_chars = len(char2idx)
print(n_chars)

399


In [28]:
# Sequence geometry: sentences are padded/truncated to max_len tokens and every
# token to max_len_char characters.
max_len = 400
max_len_char = 32

# Layer sizes used when the model is built.
char_embedding_dim = 32
character_LSTM_unit = 32
main_lstm_unit = 256

# Prepare data

In [29]:
def prepare_sequence_word(list_sent):
    """Convert a sentence (iterable of words) into a list of thai2fit ids.

    Words that are not keys of ``thai2dict`` are mapped to the id of the
    "unknown" token. Returns a new list with one id per input word.
    """
    return [
        thai2dict_to_idx[word] if word in thai2dict else thai2dict_to_idx["unknown"]
        for word in list_sent
    ]

def prepare_sequence_target(input_label):
    """Convert a sequence of tag strings into their ids from ``ner_to_idx``.

    Raises KeyError for a tag that is not in ``ner_to_idx``.
    """
    idxs = []
    for tag in input_label:
        idxs.append(ner_to_idx[tag])
    return idxs

In [30]:
# Separate each (word, label) sentence into a word sequence and a label sequence.
input_sent = []       # words only, training split
train_targets = []    # labels only, training split
for sent in train_sents:
    input_sent.append([word for word, label in sent])
    train_targets.append([label for word, label in sent])

input_test_sent = []  # words only, test split
test_targets = []     # labels only, test split
for sent in test_sents:
    input_test_sent.append([word for word, label in sent])
    test_targets.append([label for word, label in sent])

In [31]:
## Word-level training features
# Map each word to its thai2fit id, then right-pad / right-truncate to max_len.
X_word_train = [prepare_sequence_word(list_sent) for list_sent in input_sent]
X_word_train = pad_sequences(maxlen=max_len, sequences=X_word_train, value=thai2dict_to_idx["pad"], padding='post', truncating='post')

## Character-level training features
# One (max_len, max_len_char) id matrix per sentence. The matrix starts fully
# padded and only the positions that really exist are filled in. This replaces
# a bare `except:` that was used to detect out-of-range positions: it raised
# one exception per padded slot and would also have hidden any unrelated error.
X_char_train = []
for sentence in train_sents:
    sent_seq = np.full((max_len, max_len_char), char2idx["pad"])
    for i, token in enumerate(sentence[:max_len]):
        # token is a (word, label) pair; only the word's characters are encoded.
        for j, ch in enumerate(token[0][:max_len_char]):
            sent_seq[i, j] = char2idx.get(ch, char2idx["unknown"])
    X_char_train.append(sent_seq)


## Training targets
# Tag ids padded with the "pad" class, then one-hot encoded per sentence.
y_train = [prepare_sequence_target(labels_sent) for labels_sent in train_targets ]
y_train = pad_sequences(maxlen=max_len, sequences=y_train, value=ner_to_idx["pad"], padding='post', truncating='post')
y_train = [to_categorical(idx_ner, num_classes=n_tag) for idx_ner in y_train]

In [32]:
# Stack the per-sentence one-hot matrices into a single array before reporting.
y_train = np.array(y_train)

print("X_train: ", X_word_train.shape)
print("y_train: ", y_train.shape)

X_train:  (552, 400)
y_train:  (552, 400, 6)


In [33]:
## Word-level test features
# Map each word to its thai2fit id, then right-pad / right-truncate to max_len.
X_word_test = [prepare_sequence_word(list_sent) for list_sent in input_test_sent]
X_word_test = pad_sequences(maxlen=max_len, sequences=X_word_test, value=thai2dict_to_idx["pad"], padding='post', truncating='post')

## Character-level test features
# One (max_len, max_len_char) id matrix per sentence. The matrix starts fully
# padded and only the positions that really exist are filled in. This replaces
# a bare `except:` that was used to detect out-of-range positions: it raised
# one exception per padded slot and would also have hidden any unrelated error.
X_char_test = []
for sentence in test_sents:
    sent_seq = np.full((max_len, max_len_char), char2idx["pad"])
    for i, token in enumerate(sentence[:max_len]):
        # token is a (word, label) pair; only the word's characters are encoded.
        for j, ch in enumerate(token[0][:max_len_char]):
            sent_seq[i, j] = char2idx.get(ch, char2idx["unknown"])
    X_char_test.append(sent_seq)


## Test targets
# Tag ids padded with the "pad" class, then one-hot encoded per sentence.
y_test = [prepare_sequence_target(labels_sent) for labels_sent in test_targets ]
y_test = pad_sequences(maxlen=max_len, sequences=y_test, value=ner_to_idx["pad"], padding='post', truncating='post')
y_test = [to_categorical(idx_ner, num_classes=n_tag) for idx_ner in y_test]

In [34]:
# Stack the per-sentence one-hot matrices into a single array before reporting.
y_test = np.array(y_test)

# The label previously read "X_train" although the test matrix is printed.
print("X_test: ", X_word_test.shape)
print("y_test: ", y_test.shape)

X_train:  (138, 400)
y_test:  (138, 400, 6)


# Train model

In [35]:
# Keras Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input ,LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from tensorflow.keras.layers import concatenate, SpatialDropout1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow import keras

In [36]:
# Word input: one thai2fit id per token position.
# shape must be a tuple; `(max_len)` without the trailing comma is a plain int.
word_in = Input(shape=(max_len,), name='word_input_')

# Word embedding initialised from the thai2fit matrix and kept frozen.
word_embeddings = Embedding(input_dim=n_thai2dict, output_dim=400, weights = [thai2fit_weight], input_length=max_len,
                                               mask_zero=False, trainable=False, name="word_embedding")(word_in)

# Character input: max_len_char character ids for each of the max_len tokens.
char_in = Input(shape=(max_len, max_len_char,), name='char_input')

# Character embedding, applied to every token independently.
emb_char = TimeDistributed(Embedding(input_dim=n_chars, output_dim=char_embedding_dim,
                           input_length=max_len_char, mask_zero=False))(char_in)

# Character sequence -> one vector per token via a (unidirectional) LSTM.
char_enc = TimeDistributed(LSTM(units=character_LSTM_unit, return_sequences=False))(emb_char)

# Concatenate word-level and character-level token representations.
all_word_embeddings = concatenate([word_embeddings, char_enc])
all_word_embeddings = SpatialDropout1D(0.3)(all_word_embeddings)

# Sentence-level tagger: LSTM over tokens, then a per-token softmax over tags.
main_lstm = LSTM(units=main_lstm_unit, return_sequences=True,)(all_word_embeddings)
dens = TimeDistributed(Dense(100, activation="relu"))(main_lstm)
out = Dense(n_tag, activation="softmax")(dens)
model = keras.Model(inputs=[word_in, char_in], outputs=[out])

# mask_zero is False on both embeddings, so padded positions are not masked and
# the "pad" tag is trained as an ordinary class.
model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(learning_rate=0.001))
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_input (InputLayer)         [(None, 400, 32)]    0                                            
__________________________________________________________________________________________________
word_input_ (InputLayer)        [(None, 400)]        0                                            
__________________________________________________________________________________________________
time_distributed_3 (TimeDistrib (None, 400, 32, 32)  12768       char_input[0][0]                 
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 400, 400)     22270800    word_input_[0][0]                
____________________________________________________________________________________________

In [37]:
# Train on the training split; the test split is passed as validation data, so
# the per-epoch validation loss is measured on the same data evaluated below.
model.fit(
    x=[X_word_train, np.array(X_char_train)],
    y=y_train,
    batch_size=64,
    epochs=40,
    verbose=1,
    validation_data=([X_word_test, np.array(X_char_test)], y_test),
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1f44d0fc670>

In [38]:
# Per-token tag probabilities for the test split.
pred_model = model.predict(
    [X_word_test, np.array(X_char_test)]
)

In [39]:
# Most likely tag id per token, for predictions and for the one-hot gold labels.
# Distinct names are used so the model's `out` tensor from the model-definition
# cell is no longer overwritten by a NumPy array.
pred_tag_ids = np.argmax(pred_model, axis=-1)
true_tag_ids = np.argmax(y_test, axis=-1)

# Map the ids back to tag strings, one list per sentence.
y_pred = [[idx_to_ner[tag_id] for tag_id in row] for row in pred_tag_ids]
y_true = [[idx_to_ner[tag_id] for tag_id in row] for row in true_tag_ids]

In [40]:
# Drop padded positions from both sequences using the GOLD labels as the mask.
# Filtering "pad" out of the predictions independently lets the two sequences
# end up with different lengths (or shifted against each other) whenever the
# model predicts "pad" on a real token or a real tag on a padded one.
# A "pad" predicted on a real token is kept and simply counts as a wrong tag.
y_real = []
y_hat = []
for true_tags, pred_tags in zip(y_true, y_pred):
    is_real = [tag != "pad" for tag in true_tags]
    y_real.append([tag for tag, keep in zip(true_tags, is_real) if keep])
    y_hat.append([tag for tag, keep in zip(pred_tags, is_real) if keep])

In [41]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

def ner_classification_report(y_true, y_pred):
    """Token-level classification report over every tag except "O".

    y_true / y_pred are lists of tag-string lists (one per sentence). They are
    flattened and binarised with a LabelBinarizer fitted on the gold tags.
    Prints the overall accuracy and the reported tag set, and returns the
    text report produced by scikit-learn.
    """
    # Imported locally because a later cell rebinds the module-level names
    # accuracy_score / classification_report to the seqeval versions, which
    # would otherwise be picked up if this function is called after that cell.
    from sklearn.metrics import accuracy_score, classification_report

    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
    print("accuracy",accuracy_score(y_true_combined, y_pred_combined))

    # Report every class except the outside tag. The previous `[:-1]` slice
    # only worked when "O" was present and happened to sort last.
    tagset = [str(cls) for cls in lb.classes_ if cls != "O"]
    print(tagset)
    class_indices = {str(cls): idx for idx, cls in enumerate(lb.classes_)}

    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
        zero_division=0
    )

print(ner_classification_report(y_real,y_hat))

accuracy 0.6076181734740707
['B-c', 'B-p', 'I-c', 'I-p']
              precision    recall  f1-score   support

         B-c       0.82      0.20      0.32       157
         B-p       0.75      0.48      0.59       158
         I-c       0.58      0.51      0.54      1735
         I-p       0.60      0.81      0.69      3727

   micro avg       0.60      0.70      0.64      5777
   macro avg       0.69      0.50      0.53      5777
weighted avg       0.60      0.70      0.63      5777
 samples avg       0.46      0.46      0.46      5777



In [42]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

# accuracy_score / classification_report here are the seqeval functions imported
# just above in this cell (they shadow the scikit-learn names of the same name).
print("accuracy:" ,accuracy_score(y_real, y_hat))
# Report computed by seqeval from the tag sequences.
print(classification_report(y_real, y_hat))

accuracy: 0.6076181734740707
              precision    recall  f1-score   support

           c       0.13      0.17      0.15       157
           p       0.11      0.25      0.16       158

   micro avg       0.12      0.21      0.15       315
   macro avg       0.12      0.21      0.15       315
weighted avg       0.12      0.21      0.15       315



In [43]:
path = "../../trained_model/LSTM/"

# Persist the two lookup tables next to the weights: character -> id and
# tag -> id.
for file_name, mapping in (('chardict.pickle', char2idx), ('nerdict.pickle', ner_to_idx)):
    with open(path + file_name, 'wb') as handle:
        pickle.dump(mapping, handle)

# Only the weights are stored, not the architecture.
model.save_weights(path+"model_LSTM.hdf5")