개체명 인식 훈련(LSTM)

In [9]:
import numpy as np
import urllib.request

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

데이터 준비
   - https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt

In [2]:
# Download the CoNLL-2003 training file and group its lines into sentences.
# Each sentence becomes a list of [lowercased word, NER tag] pairs.
tagged_sentences = []
sentence = []

with urllib.request.urlopen('https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt') as f:
    for line in f:
        # The response yields bytes. Stripping first means '\n', '\r\n' and
        # whitespace-only separator lines are all recognised as blank.
        line = line.decode('utf-8').strip()
        if not line or line.startswith('-DOCSTART'):
            # A blank line or a document marker closes the sentence in progress.
            if sentence:
                tagged_sentences.append(sentence)
                sentence = []
            continue
        splits = line.split()
        # First column is the word, last column is the NER tag.
        sentence.append([splits[0].lower(), splits[-1]])

# Keep the final sentence even when the file does not end with a blank line.
if sentence:
    tagged_sentences.append(sentence)
    sentence = []

print(len(tagged_sentences))
print(tagged_sentences[0])

14041
[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]


데이터 전처리

In [3]:
# Separate the words from their NER tags so that each sentence yields two
# parallel lists: one of words and one of tags.

sentences = [[pair[0] for pair in tagged] for tagged in tagged_sentences]
ner_tags = [[pair[1] for pair in tagged] for tagged in tagged_sentences]

In [4]:
# Tokenize so that only the most frequent words are kept (capped by
# max_words); words outside that set are represented by the 'OOV' token.

max_words = 4000

# Tag vocabulary: no size cap, every tag gets its own index.
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

# Word vocabulary: capped, with an explicit out-of-vocabulary token.
src_tokenizer = Tokenizer(oov_token='OOV', num_words=max_words)
src_tokenizer.fit_on_texts(sentences)

In [5]:
# Sizes used later for the embedding input and the softmax output.
# tag_size adds one to the number of distinct tags, leaving room for index 0.
tag_size = 1 + len(tar_tokenizer.word_index)
vocab_size = max_words

print(vocab_size, tag_size)

4000 10


In [6]:
# Convert the token lists into integer index sequences for training.

y_train = tar_tokenizer.texts_to_sequences(ner_tags)
x_train = src_tokenizer.texts_to_sequences(sentences)

In [7]:
# Padding: bring every sentence to the same length (max_len), adding the
# padding at the end of the sequence ('post') for both inputs and labels.

max_len = 70
y_train = pad_sequences(y_train, maxlen=max_len, padding='post')
x_train = pad_sequences(x_train, maxlen=max_len, padding='post')

In [8]:
# Split into train and test sets (80/20, fixed seed), then one-hot encode
# the tag indices.
x_train, x_test, y_train, y_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=66,
)

# One-hot encoding adds a trailing axis of size tag_size to the label arrays.
y_train, y_test = (
    to_categorical(labels, num_classes=tag_size) for labels in (y_train, y_test)
)

print(*(arr.shape for arr in (x_train, y_train, x_test, y_test)))

(11232, 70) (11232, 70, 10) (2809, 70) (2809, 70, 10)


모델 생성 및 학습

In [10]:
# Sequence tagger: embedding -> bidirectional LSTM -> per-timestep softmax.
model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as masked input.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True),
    # return_sequences=True keeps one output per timestep, as tagging requires.
    Bidirectional(LSTM(256, return_sequences=True)),
    # The same Dense softmax over tag_size classes is applied at every timestep.
    TimeDistributed(Dense(tag_size, activation='softmax')),
])

In [12]:
# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# The held-out split is used as validation data during training.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(x_test, y_test),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2912e799948>

In [13]:
# Report [loss, accuracy] on the held-out split.
# NOTE: this is the same split that was passed as validation_data to fit().
model.evaluate(x_test, y_test)



[0.03842393308877945, 0.9473790526390076]

예측

In [14]:
# Lookup tables for turning integer indices back into words and tags.
idx2word = src_tokenizer.index_word

# Work on a copy: adding the padding entry directly to
# tar_tokenizer.index_word would silently modify the tokenizer's own state.
idx2ner = dict(tar_tokenizer.index_word)
idx2ner[0] = 'PAD'  # index 0 is the padding value, not a real tag

In [20]:
# Compare the predicted tags with the true tags for one chosen test sentence.

i = 70
# Wrap the single sample in a list so predict() receives a batch of one.
y_predict = model.predict(np.array([x_test[i]]))
# Pick the highest-scoring tag index at every position.
y_predict = np.argmax(y_predict, axis=-1)
# Undo the one-hot encoding of the true labels.
true = np.argmax(y_test[i], -1)

# One placeholder per column header, so all three headers are printed.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print("-" * 34)

for w, t, pred in zip(x_test[i], true, y_predict[0]):
    # Skip padding positions (index 0).
    if w != 0:
        print("{:17}: {:7} {}".format(idx2word[w], idx2ner[t].upper(), idx2ner[pred].upper()))

단어             |실제값  
----------------------------------
no               : O       O
one              : O       O
OOV              : O       O
from             : O       O
OOV              : O       O
.                : O       O
"                : O       O
