# 1. 라브러리 준비

In [3]:
import numpy as np
import urllib.request

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# 2. 데이터 준비
'''
https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt
'''

In [4]:
tagged_sentences =[]
sentence = []

with urllib.request.urlopen('https://raw.githubusercontent.com/Franck-Dernoncourt/NeuroNER/master/neuroner/data/conll2003/en/train.txt') as f:
    for line in f:
        line = line.decode('utf-8')
        if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
            if len(sentence) > 0:
                tagged_sentences.append(sentence)
                sentence =[]
            continue
        splits = line.strip().split(' ')
        word = splits[0].lower()
        sentence.append([word, splits[-1]])

print(len(tagged_sentences))
print(tagged_sentences)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# 3.데이터 전처리
## 3-1. 단어와 개체명 태그를 분리해서 데이터를 구성

In [5]:
sentences, ner_tags =[],[]

for tagged_sentence in tagged_sentences:
    sentence, tag_info =zip(*tagged_sentence)
    sentences.append(list(sentence))
    ner_tags.append(list(tag_info))

## 3-2. 정제 및 빈도 수가 높은 상위 단어들만 추출하기 위해 토큰화 작업

In [6]:
max_words = 4000
src_tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
src_tokenizer.fit_on_texts(sentences)

tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

In [7]:
vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1

print(vocab_size)
print(tag_size)

4000
10


## 3-3.데이터를 학습에 활용하기 위해 데이터를 배열로 변환
## 해당 작업은 토콘화 툴의 texts_to_sequences()를 통해 수행

In [8]:
X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

## 학습에 투입할 대는 동일한 길이를 갖고 있어야 하므로, 지정해 둔 최대 길이에 맞춰 모든 데이터를 동일한 길이로 맞춰준다.
## 일반적으로 길이를 맞출 때는 모자란 길이만큼 0을 추가

In [9]:
max_len = 70
X_train = pad_sequences(X_train, padding='post', maxlen=max_len)
y_train = pad_sequences(y_train, padding='post', maxlen=max_len)

## 훈련, 실험 데이터 분리 및 원 핫 인코딩을 시행

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=111)

y_train = to_categorical(y_train, num_classes=tag_size)
y_test = to_categorical(y_test, num_classes=tag_size)

## 최종적으로 생성된 데이터셋의 크기는 다음과 같음

In [11]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(11232, 70)
(11232, 70, 10)
(2809, 70)
(2809, 70, 10)


# 4. 모델 구축 및 학습
## 모델 구성
### 1. 입력을 실수 벡터로 임베딩
### 2. 양뱡향 LSTM 구성
### 3. Dense layer를 통한 각 태그에 속할 확률 예측

In [12]:
import tensorflow as tf
tf.__version__

'2.9.0'

In [15]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from keras.optimizers import Adam

In [18]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(tag_size, activation='softmax')))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 128)           512000    
                                                                 
 bidirectional (Bidirectiona  (None, 70, 512)          788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 70, 10)           5130      
 ibuted)                                                         
                                                                 
Total params: 1,305,610
Trainable params: 1,305,610
Non-trainable params: 0
_________________________________________________________________


## 4.2 모델 컴파일 및 학습 진행, 평가

In [19]:
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy']
             )

model.fit(X_train, y_train, batch_size=128, epochs=3, validation_data=(X_test, y_test))

Epoch 1/3


2023-06-16 15:11:43.306129: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-06-16 15:11:45.755433: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-16 15:11:46.394444: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_LEGACY_VARIANT
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_42/output/_19'
2023-06-16 15:11:46.397395: I tensorflow/core/grappler/optimizers/cu



2023-06-16 15:12:24.290945: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-16 15:12:24.566634: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-06-16 15:12:24.608865: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x298267250>

In [20]:
model.evaluate(X_test, y_test)



[0.05639009177684784, 0.9198888540267944]

# 5. 학습한 모델을 통한 예측
## 예측을 확인하기 위해서 인덱스를 단어로 변환해줄 사전이 필요
## 사전은 토큰화 툴의 사전을 이용

In [21]:
idx2word = src_tokenizer.index_word
idx2ner = tar_tokenizer.index_word
idx2ner[0] = 'PAD'

In [24]:
i = 60
y_predicted = model.predict(np.array([X_test[i]]))
y_predicted = np.argmax(y_predicted, axis=-1)

true = np.argmax(y_test[i], -1)

print("{:15} | {:5} | {}".format("단어","실제값","예측값"))
print("-" * 34)

for w, t, pred in zip(X_test[i], true, y_predicted[0]):
    if w != 0:
        print("{:17} | {:7} |{}".format(idx2word[w], idx2ner[t].upper(), idx2ner[pred].upper()))


단어              | 실제값   | 예측값
----------------------------------
feyenoord         | B-ORG   |B-ORG
midfielder        | O       |O
OOV               | B-PER   |B-PER
van               | I-PER   |B-PER
OOV               | I-PER   |I-PER
was               | O       |O
also              | O       |O
named             | O       |O
to                | O       |O
make              | O       |O
his               | O       |O
debut             | O       |O
in                | O       |O
the               | O       |O
OOV               | O       |O
squad             | O       |O
.                 | O       |O
