In [9]:
#!/usr/bin/env python
# encoding: utf-8
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, \
    Concatenate, Flatten, SpatialDropout1D, \
    BatchNormalization, Conv1D, Maximum, ZeroPadding1D
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.optimizers import Adam

def conv_unit(inp, n_gram, no_word=200, window=2):
    out = Conv1D(no_word, window, strides=1, padding="valid", activation='relu')(inp)
    out = TimeDistributed(Dense(5, input_shape=(n_gram, no_word)))(out)
    out = ZeroPadding1D(padding=(0, window - 1))(out)
    return out

def get_convo_nn2(no_word=200, n_gram=21, no_char=178):
    input1 = Input(shape=(n_gram,))
    input2 = Input(shape=(n_gram,))

    a = Embedding(no_char, 32, input_length=n_gram)(input1)
    a = SpatialDropout1D(0.15)(a)
    a = BatchNormalization()(a)

    a_concat = []
    for i in range(1,9):
        a_concat.append(conv_unit(a, n_gram, no_word, window=i))
    for i in range(9,12):
        a_concat.append(conv_unit(a, n_gram, no_word - 50, window=i))
    a_concat.append(conv_unit(a, n_gram, no_word - 100, window=12))
    a_sum = Maximum()(a_concat)

    b = Embedding(12, 12, input_length=n_gram)(input2)
    b = SpatialDropout1D(0.15)(b)

    x = Concatenate(axis=-1)([a, a_sum, b])
    #x = Concatenate(axis=-1)([a_sum, b])
    x = BatchNormalization()(x)

    x = Flatten()(x)
    x = Dense(100, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input1, input2], outputs=out)
    model.compile(optimizer=Adam(),
                  loss='binary_crossentropy', metrics=['acc'])
    return model


In [10]:
model = get_convo_nn2()
#model.load_weights('weight/model_weight.h5')  #For WS
model.load_weights('weight/model_weight_tnhc.h5') #For TNHC

In [11]:
from preprocessing import preprocess
from sklearn.model_selection import train_test_split
import copy as cp
from pythainlp.benchmarks.word_tokenization import benchmark
prepro = preprocess()

In [12]:
#test_list = ['wisesight-testset']
#test_list = ['testset']
test_list = ['tnhc_test']

x,y_true = prepro.preprocess_x_y(test_list)
y_true = [j for sub in y_true for j in sub if len(j) > 1]
x = [j for sub in x for j in sub if len(j) > 1]

In [13]:
y_pred=[]
for idx,item in enumerate(x):
    char_,type_ = prepro.create_feature_array(item)
    ans = model.predict([char_, type_])
    y_pred.append(ans)
    
y_pred_ = prepro.preprocessing_y_pred(y_pred)
y_pred = list(map(prepro.argmax_function,y_pred_))

In [14]:
y_test_data = [j for sub in y_pred for j in sub]
y_true_data = [j for sub in y_true for j in sub]

In [7]:
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
def eval_function(y_true,y_pred):
    f1_score_entropy=[]; 
    for index,_ in enumerate(y_pred):
        precision, recall, fscore, _ = precision_recall_fscore_support(y_true[index], y_pred[index], average='binary')
        #print(f'Precision:{precision}, Recall:{recall}, F1:{fscore}')
        f1_score_entropy.append(fscore)
    return np.mean(f1_score_entropy)

In [8]:
eval_function([y_true_data],[y_test_data])

0.9624969158647916

In [18]:
def cut(y_pred_boolean,x_data):
    x_ = cp.deepcopy(x_data)
    answer = []
    for idx,items in enumerate(y_pred_boolean):
        text = ""
        for index,item in enumerate(items):
            if(item == 1):
                text +='|'
            text +=x_[idx][index]
        answer.append(text)
    return answer 

In [19]:
y_true_data = [j for sub in y_true for j in sub]
y_original_data = [j for sub in y_pred for j in sub]

x_data = ''
for item in x:
    x_data+=item
    
dc_pred = [y_original_data]
dc_pred = cut(dc_pred,[x_data])
true_pred = [y_true_data]
true_pred = cut(true_pred,[x_data])

from itertools import accumulate
import operator

def evaluate(train : list, test: list) -> tuple:
    train_acc = list(accumulate(map(len, train), func = operator.add))
    test_acc = list(accumulate(map(len, test), func = operator.add))
    train_set = set(zip([0,*train_acc], train_acc))
    test_set = set(zip([0,*test_acc], test_acc))
    correct = len(train_set & test_set)
    pre = correct/len(test)
    re = correct/len(test)
    f1 = (2*pre*re)/(pre+re)
    return f1

dc_list = dc_pred[0].split('|')
true_list = true_pred[0].split('|')
print('Baseline:',evaluate(true_list,dc_list))

Baseline: 0.8792367221630863


In [20]:
def list2word(l):
	data = []
	temp = ""
	for i,c in enumerate(l):
		if (c == 1 and i>0):
			data.append(temp)
			temp = ""
		temp += str(c)
		if i==len(l)-1:
			data.append(temp)
	return data

def correct2(y_true,y_pred):
	l_true = list2word(y_true)
	l_pred = list2word(y_pred)
	#print(l_true)
	#print(l_pred)
	correct = []
	char_index = 0
	for i,w in enumerate(l_pred):
		#print(w)
		if w in l_true:
			if (y_true[char_index]+y_pred[char_index]==2 and y_true[char_index:char_index+len(w)-1]==y_pred[char_index:char_index+len(w)-1]): # ถ้า index char เป็น 1 ทั้งค่าจริงและทำนาย และ ถ้า list word นั้นเหมือนกันทั้งค่าจริงและทำนาย
				correct.append(w)
		char_index+=len(w)
	num_correct = len(correct)
	precision = num_correct/len(l_pred)
	recall = num_correct/len(l_true)
	#print(precision,recall,correct)
	try:
		f1 = 2*precision*recall / (precision + recall)
	except ZeroDivisionError:
		f1=0.0
	return {"precision":precision,"recall":recall,"f1":f1}

In [21]:
correct2(y_true_data,y_original_data)

{'precision': 0.9228828117936199,
 'recall': 0.9371489145041608,
 'f1': 0.9299611539512809}