In [19]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np

def read_file(file_name):
    sents = []
    with open(file_name, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
        for idx, l in enumerate(lines):
            if l[0] == ';' and lines[idx +1][0] == '$':
                this_sent = []

            elif l[0] == '$' and lines[idx -1][0] ==';':
                continue
            elif l[0] =='\n':
                sents.append(this_sent)
            else:
                this_sent.append(tuple(l.split()))
    return sents
corpus = read_file('./train.txt')

sentences, tags = [],[]
for t in corpus:
    tagged_setence = []
    sentence, bio_tag = [],[]
    for w in t:
        tagged_setence.append((w[1],w[3]))
        sentence.append(w[1])
        bio_tag.append(w[3])

    sentences.append(sentence)
    tags.append(bio_tag)
print('샘플 크기: \n', len(sentences))
print("0번째 샘플 문장 시퀸스 : \n",sentences[0])
print("0번째 샘플 bio 태그 : \n", tags[0])
print("샘플 문장 시퀸스 최대 길이 : ", max(len(l) for l in sentences))
print("샘플 문장 평균 길이 : ", (sum(map(len, sentences))/len(sentences)))

sent_tokenizer = preprocessing.text.Tokenizer(oov_token='OOV') # 첫번째 인덱스에 oov사용
sent_tokenizer.fit_on_texts(sentences)
tag_tokenizer = preprocessing.text.Tokenizer(lower= False)
tag_tokenizer.fit_on_texts(tags)
vocab_size = len(sent_tokenizer.word_index) + 1
tag_size =len(tag_tokenizer.word_index) +1
print("태그 사전 크기 : ", tag_size)
print("단어 사전 크기 : ", vocab_size)

x_train = sent_tokenizer.texts_to_sequences(sentences)
y_train = tag_tokenizer.texts_to_sequences(tags)
print(x_train[0])
print(y_train[0])

index_to_word = sent_tokenizer.index_word
index_to_ner = tag_tokenizer.index_word
index_to_ner[0] = 'PAD'

max_len = 40
x_train = preprocessing.sequence.pad_sequences(x_train, padding = 'post', maxlen = max_len)
y_train = preprocessing.sequence.pad_sequences(y_train, padding = 'post', maxlen = max_len)

x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size = .2, random_state = 0)
y_train = tf.keras.utils.to_categorical(y_train, num_classes = tag_size)
y_test = tf.keras.utils.to_categorical(y_test, num_classes = tag_size)

print('학습 시퀸스 형상 : ', x_train.shape)
print('학습 샘플 레이블 형상 : ', y_train.shape)
print('테스트 샘플 시퀸스 형상 : ', x_test.shape)
print('샘플 레이블 형상 : ', y_test.shape)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = 30, input_length = max_len, mask_zero = True))
model.add(Bidirectional(LSTM(200, return_sequences = True, dropout = 0.50, recurrent_dropout = 0.25)))
model.add(TimeDistributed(Dense(tag_size, activation = 'softmax')))
model.compile(loss='categorical_crossentropy', optimizer =Adam(0.01), metrics = ['accuracy'])
model.fit(x_train, y_train, batch_size = 128, epochs = 10)
print("평과결과 : ", model.evaluate(x_test, y_test)[1])

def sequences_to_tag(sequences):
    result = []
    for sequence in sequences:
        temp = []
        for pred in sequence:
            pred_index = np.argmax(pred)
            temp.append(index_to_ner[pred_index].replace('PAD','0'))
        result.append(temp)
    return result

y_predicted = model.predict(x_test)
pred_tags = sequences_to_tag(y_predicted)
test_tags = sequences_to_tag(y_test)

from seqeval.metrics import f1_score, classification_report
print(classification_report(test_tags, pred_tags))
print("F1-score : {:0.1%}".format(f1_score(test_tags,pred_tags)))
word_to_index = sent_tokenizer.word_index
new_sentence = '삼성전자 출시 스마트폰 오늘 애플 도전장 내밀다.'.split()
new_x = []

for w in new_sentence:
    try:
        new_x.append(word_to_index.get(w,1))
    except KeyError:
        new_x.append(word_to_index['OOV'])
print('새로운 유형의 시퀸스 : ', new_x)
new_padded_seqs = preprocessing.sequence.pad_sequences([new_x], padding='post', value = 0, maxlen = max_len)

p = model.predict(np.array([new_padded_seqs[0]]))
p = np.argmax(p, axis = -1)

print('{:10} {:5}'.format("단어","예측된 NER"))
print("=" * 50)
for w, pred in zip(new_sentence, p[0]):
    print("{:10}{:5}".format(w, index_to_ner[pred]))

샘플 크기: 
 3555
0번째 샘플 문장 시퀸스 : 
 ['한편', ',', 'AFC', '챔피언스', '리그', 'E', '조', '에', '속하', 'ㄴ', '포항', '역시', '대회', '8강', '진출', '이', '불투명', '하', '다', '.']
0번째 샘플 bio 태그 : 
 ['O', 'O', 'O', 'O', 'O', 'B_OG', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
샘플 문장 시퀸스 최대 길이 :  168
샘플 문장 평균 길이 :  34.03909985935302
태그 사전 크기 :  8
단어 사전 크기 :  13834
[183, 11, 4276, 884, 162, 931, 402, 10, 2608, 7, 1516, 608, 145, 1361, 414, 4, 6347, 2, 8, 3]
[1, 1, 1, 1, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
학습 시퀸스 형상 :  (2844, 40)
학습 샘플 레이블 형상 :  (2844, 40, 8)
테스트 샘플 시퀸스 형상 :  (711, 40)
샘플 레이블 형상 :  (711, 40, 8)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
평과결과 :  0.9372843503952026
              precision    recall  f1-score   support

           _       0.59      0.60      0.59       644
         _DT       0.93      0.89      0.91       335
         _LC       0.70      0.55      0.62       312
         _OG       0.76  

NameError: name 'prnit' is not defined

In [17]:
pip install seqeval

Collecting seqevalNote: you may need to restart the kernel to use updated packages.
  Downloading seqeval-1.2.2.tar.gz (43 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16175 sha256=8cabd64a6d597569b752f301623b1e7c403d59748d91bf32a75e7d15d8104c16
  Stored in directory: c:\users\my\appdata\local\pip\cache\wheels\ad\5c\ba\05fa33fa5855777b7d686e843ec07452f22a66a138e290e732

Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
