In [4]:
import tensorflow as tf
from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
from Preprocess import Preprocess

def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx + 1][0] == '$':
                this_sent = []
            elif l[0] =='$' and lines[idx -1][0] == ';':
                continue
            elif l[0] =='\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents

p = Preprocess(word2index_dic='../../train_tools/dict/chatbot_dict.bin',userdic='../../utils/user_dic.tsv')

corpus = read_file('ner_train.txt')
sentences, tags= [],[]
for t in corpus:
    tagged_sentence = []
    sentence, bio_tag = [],[]
    for w in t:
        tagged_sentence.append((w[1],w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])
    sentences.append(sentence)
    tags.append(bio_tag)
print('샘플 크기 : \n',len(sentences))
print("0번째 샘플 단어 시퀸스 : \n", sentences[0])
print("0번째 bio 태그 : \n", tags[0])
print("샘플 단어 시퀸스 최대 길이 : ", max(len(l) for l in sentences)) # 최대 단어 길이
print("샘플 시퀸스 평균 길이 : ",(sum(map(len, sentences))/len(sentences)))

#토크나이저 정의
tag_tokenizer = preprocessing.text.Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)

vocab_size = len(p.word_index) +1
tag_size = len(tag_tokenizer.word_index) + 1
print("BIO 태그 사전 크기 : ",tag_size)
print("단어 사전 크기 :", vocab_size)

#학습용 예제
x_train = [p.get_wordidx_sequence(sent) for sent in sentences]
y_train = tag_tokenizer.texts_to_sequences(tags)

index_to_ner = tag_tokenizer.index_word
index_to_ner[0] = 'PAD'

max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train, padding='post', maxlen = max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding= 'post', maxlen =max_len)

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size= .2, random_state = 1234)
#출력데이터
y_train = tf.keras.utils.to_categorical(y_train, num_classes=tag_size)
y_test = tf.keras.utils.to_categorical(y_test,num_classes=tag_size)

print("학습 시퀸스 형상 : ", x_train.shape)
print("학습 샘플 레이블 형상 : ", x_test.shape)
print("텍스트 샘플 시퀸스 형상 : ", y_train.shape)
print("텍스트 샘플 레이블 형상 : ", y_test.shape)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(input_dim= vocab_size, output_dim=30, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(200, return_sequences=True, dropout=0.50, recurrent_dropout= 0.25)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.01), metrics=['accuracy'])
model.fit(x_train, y_train, batch_size = 128, epochs = 10)

print("평과 결과 :", model.evaluate(x_test, y_test)[1])
model.save('ner_model.h5')

def sequences_to_tag(sequences):
    result = []
    for sequence in sequences:
        temp= []
        for pred in sequence:
            pred_index = np.argmax(pred)
            temp.append(index_to_ner[pred_index].replace("PAD","0"))
        result.append(temp)
    return result
from seqeval.metrics import f1_score, classification_report
#데이터셋 ner 예측
y_predicted = model.predict(x_test)
pred_tags = sequences_to_tag(y_predicted)
test_tags = sequences_to_tag(y_test)

print(classification_report(test_tags, pred_tags))
print("F1_score : {:.1%}".format(f1_score(test_tags, pred_tags)))



hi
샘플 크기 : 
 61999
0번째 샘플 단어 시퀸스 : 
 ['가락지빵', '주문', '하', '고', '싶', '어요']
0번째 bio 태그 : 
 ['B_FOOD', 'O', 'O', 'O', 'O', 'O']
샘플 단어 시퀸스 최대 길이 :  168
샘플 시퀸스 평균 길이 :  8.796238649010467
BIO 태그 사전 크기 :  10
단어 사전 크기 : 17869
학습 시퀸스 형상 :  (49599, 40)
학습 샘플 레이블 형상 :  (12400, 40)
텍스트 샘플 시퀸스 형상 :  (49599, 40, 10)
텍스트 샘플 레이블 형상 :  (12400, 40, 10)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
평과 결과 : 0.9859992861747742
              precision    recall  f1-score   support

          NP       1.00      1.00      1.00       303
           _       0.55      0.55      0.55       647
         _DT       1.00      1.00      1.00     13683
       _FOOD       1.00      1.00      1.00     11655
         _LC       0.70      0.64      0.67       314
         _OG       0.48      0.58      0.52       460
         _PS       0.66      0.52      0.58       396
         _TI       0.68      0.79      0.73        61

   micro avg       0.97      0.97     

In [6]:
    import tensorflow as tf
    import numpy as np
    from tensorflow.keras import preprocessing

    class NerModel:
        def __init__(self, model_name, proprocess):
            self.index_to_ner ={1:'0',2:'B_DT',3:'B_FOOD',4:'I',5:'B_OG',6:'B_PS',7:'B_LC',8:'NNp', 9:'B_TI', 0:'PAD'}

            self.model = load_model(model_name)
            self.p = proprocess
        def predict(self, query):
            pos = self.p.pos(query)
            keywords = self.p.get_keywords(pos, without_tag=True)
            sequences = [self.p.get_wordidx_sequence(keywords)]

            max_len = 40
            padded_seqs = preprocessing.sequence.pad_sequences(sequences, padding = 'post', value = 0, maxlen = max_len)

            predict = self.model.predict(np.array([padded_seqs[0]]))
            predict_class = tf.math.argmax(predict, axis = -1)
        def predict_tags(self, query):
            pos = self.p.pos(query)
            keywords = self.p.get_keywords(pos, without_tag = True)
            sequences = [self.p.get_wordidx_sequence(keywords)]

            max_len = 40
            padded_seqs = preprocessing.sequence.pad_sequences(sequences, padding = 'post', value = 0, maxlen = max_len)

            predict = self.model.predict(np.array([padded_seqs[0]]))
            predict_class = tf.math.argmax(predict, axis = -1)
            tags = []

            for tag_idx in predict_class.numpy()[0]:
                if tag_idx == 1: continue
                tags.append(self.index_to_ner[tag_idx])
                if len(tags) == 0:
                    return None
                return tags
