今回は、対象の単語とその左右３単語ずつ(固定幅のウィンドウ)を入力として、feedforward neural networkで単語の品詞タグを予測するモデルを学習してみます．  
かなりシンプルですが、高速に予測できるという魅力から、より強力なモデルが沢山存在する今でも、NLPのいろんなツールで使われてます．  
easyccg: https://github.com/mikelewis0/easyccg  
syntaxnet: https://github.com/tensorflow/models/tree/master/research/syntaxnet  
stanford parser: https://nlp.stanford.edu/software/lex-parser.shtml  
(イメージ)　　
<img src='images/ff.png'>
(画像: https://xbt.net/blog/what-is-enigma/)

In [1]:
# つかうライブラリの読み込み

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Input, Reshape
from keras.optimizers import SGD
from collections import Counter

# 👇無視してOK

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# The CoNLL format, which comes up a lot in NLP
!head -20 data/test.conll

1	No	no	ADV	ADV	_	7	discourse	wsj_2300.1	_
2	,	,	PUNCT	PUNCT	_	7	punct	_	_
3	it	it	PRON	PRON	_	7	nsubj	_	_
4	was	be	VERB	VERB	_	7	cop	_	_
5	n't	not	PART	PART	_	7	neg	_	_
6	Black	Black	PROPN	PROPN	_	7	compound	_	_
7	Monday	Monday	PROPN	PROPN	_	0	root	_	_
8	.	.	PUNCT	PUNCT	_	7	punct	_	_

1	Once	once	ADV	ADV	_	2	advmod	wsj_2300.10	_
2	again	again	ADV	ADV	_	9	advmod	_	_
3	-LCB-	-lcb-	PUNCT	PUNCT	_	9	punct	_	_
4	the	the	DET	DET	_	5	det	_	_
5	specialists	specialist	NOUN	NOUN	_	9	nsubj	_	_
6	-RCB-	-rcb-	PUNCT	PUNCT	_	9	punct	_	_
7	were	be	VERB	VERB	_	9	cop	_	_
8	not	not	PART	PART	_	9	neg	_	_
9	able	able	ADJ	ADJ	_	24	ccomp	_	_
10	to	to	PART	PART	_	11	mark	_	_
11	handle	handle	VERB	VERB	_	9	xcomp	_	_


In [3]:
def read_conll(file):
    """Read a CoNLL-style file into a list of ``(words, tags)`` sentence pairs.

    Every non-blank line is split on tabs; the second column (index 1) is
    taken as the word form and lower-cased, the fourth column (index 3) is
    taken as the tag. A blank line ends the current sentence.

    Args:
        file: Path of the file to read.

    Returns:
        A list with one ``(words, tags)`` tuple per sentence, where ``words``
        and ``tags`` are lists of strings of equal length.

    Raises:
        IndexError: If a non-blank line has fewer than four tab-separated
            columns.
    """
    res = []
    words = []
    tags = []
    # The context manager guarantees the handle is closed, even on error.
    with open(file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: emit the sentence collected so far. Checking for
                # content keeps repeated blank lines from adding empty sentences.
                if words:
                    res.append((words, tags))
                    words = []
                    tags = []
            else:
                # Pull out the word and its tag.
                items = line.split('\t')
                words.append(items[1].lower())  # keep words lower-cased
                tags.append(items[3])
    # A file that does not end with a blank line still has a pending sentence.
    if words:
        res.append((words, tags))
    return res

In [4]:
# Load the three splits in one go: training data, test (evaluation) data,
# and development data, in that order.
train_sents, test_sents, dev_sents = map(
    read_conll,
    ('data/train.conll', 'data/test.conll', 'data/dev.conll'),
)

In [5]:
words, tags = test_sents[0] # the very first sentence
print(words)
print(tags)

['no', ',', 'it', 'was', "n't", 'black', 'monday', '.']
['ADV', 'PUNCT', 'PRON', 'VERB', 'PART', 'PROPN', 'PROPN', 'PUNCT']


In [6]:
def sliding_windows(lst, size=7):
    """Return every contiguous window of ``size`` items taken from ``lst``.

    Args:
        lst: Sequence to slice (anything supporting ``len`` and slicing).
        size: Window width. Defaults to 7, i.e. a centre item with three
            neighbours on each side.

    Returns:
        A list of ``len(lst) - size + 1`` slices in left-to-right order, or an
        empty list when ``lst`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError('size must be positive')
    # range() over a non-positive count is empty, which covers short inputs.
    return [lst[i:i + size] for i in range(len(lst) - size + 1)]

In [None]:
# Quick check: 10 items yield 4 windows of 7 consecutive items each.
sliding_windows(list(range(10)))

In [7]:
# Windows over the first test sentence before any padding is added.
print(words)
print(sliding_windows(words))

['no', ',', 'it', 'was', "n't", 'black', 'monday', '.']
[['no', ',', 'it', 'was', "n't", 'black', 'monday'], [',', 'it', 'was', "n't", 'black', 'monday', '.']]


👆これはだめ: 8単語の文なのにウィンドウが2つしか作れず、中心の単語('was'と"n't")以外はタグを予測できない．文の両端にPADを3つずつ足して、すべての単語がウィンドウの中心に来るようにする．

In [8]:
PAD = 'PAD'

# Surround every sentence with three PAD tokens on each side; the tag lists
# are left unchanged.
train_sents, test_sents, dev_sents = (
    [([PAD] * 3 + sent + [PAD] * 3, labels) for sent, labels in split]
    for split in (train_sents, test_sents, dev_sents)
)

In [9]:
# The same first test sentence, now padded, and the windows built from it.
words, tags = test_sents[0]
print(words)
print(sliding_windows(words))

['PAD', 'PAD', 'PAD', 'no', ',', 'it', 'was', "n't", 'black', 'monday', '.', 'PAD', 'PAD', 'PAD']
[['PAD', 'PAD', 'PAD', 'no', ',', 'it', 'was'], ['PAD', 'PAD', 'no', ',', 'it', 'was', "n't"], ['PAD', 'no', ',', 'it', 'was', "n't", 'black'], ['no', ',', 'it', 'was', "n't", 'black', 'monday'], [',', 'it', 'was', "n't", 'black', 'monday', '.'], ['it', 'was', "n't", 'black', 'monday', '.', 'PAD'], ['was', "n't", 'black', 'monday', '.', 'PAD', 'PAD'], ["n't", 'black', 'monday', '.', 'PAD', 'PAD', 'PAD']]


In [10]:
# Dictionary that converts a word into a non-negative integer ID
UNK = 'UNK'

# A word embedding only trains well when the word shows up in many different
# contexts, so words occurring fewer than two times in the training data get
# no entry of their own; UNK is added as the last entry.
word_count = Counter()
word_count.update(token for sent, _ in train_sents for token in sent)
word_set = [token for token, freq in word_count.most_common() if freq >= 2] + [UNK]
word_dict = dict(zip(word_set, range(len(word_set))))

In [11]:
# Dictionary that converts a POS tag into a non-negative integer ID
tag_set = {tag for _, tags in train_sents for tag in tags}
# Number the tags in sorted order. Iterating the set directly follows string
# hash order, which differs between interpreter runs, so IDs assigned that
# way would not line up with weights saved in an earlier session.
tag_dict = {w: i for i, w in enumerate(sorted(tag_set))}

In [12]:
# Report how many entries each dictionary has.
print('word_dict size', len(word_dict))
print('tag_dict size', len(tag_dict))

word_dict size 21569
tag_dict size 17


In [13]:
print(word_dict['dog']) # what is the ID of "dog"?

6580


In [14]:
# Turn the training data into lists of ID windows (xs) and tag IDs (ys);
# words missing from word_dict are mapped to the UNK ID.
xs, ys = [], []
for words, tags in train_sents:
    for window in sliding_windows(words):
        xs.append([word_dict[word] if word in word_dict else word_dict[UNK]
                   for word in window])
    ys += [tag_dict[tag] for tag in tags]

In [15]:
# Convert to numpy arrays ('i' selects an integer dtype)
xs = np.array(xs, 'i')
ys = np.array(ys, 'i')
# Pass the number of classes explicitly, as make_matrices does, so the one-hot
# width is always len(tag_dict) rather than inferred from the largest ID seen.
ys = keras.utils.to_categorical(ys, len(tag_dict))

In [16]:
# Check the shapes of the two arrays.
print('dimensions of xs', xs.shape)
print('dimensions of ys', ys.shape)

dimensions of xs (929552, 7)
dimensions of ys (929552, 17)


In [17]:
# The test and dev data need the same conversion, so wrap it in a function
def make_matrices(words_and_tags):
    """Convert padded ``(words, tags)`` sentences into model input/target arrays.

    Each sentence contributes one row of word IDs per window returned by
    ``sliding_windows`` and one tag ID per tag. Words missing from
    ``word_dict`` are mapped to the ID of ``UNK``.

    Args:
        words_and_tags: Iterable of ``(words, tags)`` pairs.

    Returns:
        A tuple ``(xs, ys)``: ``xs`` is an integer array of window word IDs and
        ``ys`` is the one-hot encoding of the tag IDs with ``len(tag_dict)``
        classes. The shapes of both arrays are printed as a side effect.

    Raises:
        KeyError: If a tag is not present in ``tag_dict``.
    """
    xs = []
    ys = []
    unk_id = word_dict[UNK]  # looked up once instead of once per word
    for words, tags in words_and_tags:
        for window in sliding_windows(words):
            xs.append([word_dict.get(word, unk_id) for word in window])
        ys.extend(tag_dict[tag] for tag in tags)

    # The original line had a full-width space (U+3000) before its comment,
    # which Python rejects as an invalid character; plain ASCII spaces here.
    xs = np.array(xs, 'i')  # 'i' selects an integer dtype
    ys = np.array(ys, 'i')
    ys = keras.utils.to_categorical(ys, len(tag_dict))
    print('dimensions of xs', xs.shape)
    print('dimensions of ys', ys.shape)
    return xs, ys

In [18]:
# Build the input/target arrays for each of the three splits.
train_xs, train_ys = make_matrices(train_sents)
test_xs, test_ys = make_matrices(test_sents)
dev_xs, dev_ys = make_matrices(dev_sents)

dimensions of xs (929552, 7)
dimensions of ys (929552, 17)
dimensions of xs (55371, 7)
dimensions of ys (55371, 17)
dimensions of xs (45422, 7)
dimensions of ys (45422, 17)


単語のID列${\bf x} = x_{-3}, x_{-2}, x_{-1}, x, x_{+1}, x_{+2}, x_{+3}$に対して  
$\mathrm{Embedding}({\bf x}) = [ {\bf e}_{x_{-3}} \mid {\bf e}_{x_{-2}} \mid {\bf e}_{x_{-1}} \mid {\bf e}_{x} \mid {\bf e}_{x_{+1}} \mid {\bf e}_{x_{+2}} \mid {\bf e}_{x_{+3}} ]^T = {\bf E}^T$,  
$\mathrm{Reshape}({\bf E}^T) = [ {\bf e}_{x_{-3}}, {\bf e}_{x_{-2}}, {\bf e}_{x_{-1}}, {\bf e}_{x}, {\bf e}_{x_{+1}}, {\bf e}_{x_{+2}}, {\bf e}_{x_{+3}} ]^T = {\bf e}$ (縦に並べる),  
$f({\bf x}) = \mathrm{softmax}(W_3 \tanh (W_2 \tanh (W_1 {\bf e} + b_1) + b_2) + b_3)$.

In [24]:
VOCAB_SIZE = len(word_dict)   # number of vocabulary entries
EMBED_DIM = 128               # dimensionality of the embedding vectors
HIDDEN1_DIM = 256             # hidden layer 1
HIDDEN2_DIM = 128             # hidden layer 2
NUM_TAGS = len(tag_dict)      # number of output classes

# Embedding -> flatten the 7 embeddings into one vector -> two tanh hidden
# layers -> softmax over the tags.
model = Sequential([
    Embedding(VOCAB_SIZE, EMBED_DIM),
    Reshape((EMBED_DIM * 7,)),
    Dense(HIDDEN1_DIM, activation='tanh'),
    Dense(HIDDEN2_DIM, activation='tanh'),
    Dense(NUM_TAGS, activation='softmax'),
])

In [25]:
# Draw the model's layer graph; the second argument is the output image path.
keras.utils.plot_model(model, 'images/model.png')

計算グラフの可視化
<img src='images/model.png'>

In [26]:
# Print the per-layer summary of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         2760832   
_________________________________________________________________
reshape_2 (Reshape)          (None, 896)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               229632    
_________________________________________________________________
dense_5 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_6 (Dense)              (None, 17)                2193      
Total params: 3,025,553
Trainable params: 3,025,553
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Configure training: categorical cross-entropy loss, SGD constructed with its
# default arguments, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer=SGD(),
              metrics=['accuracy'])

In [29]:
# Training; the dev split is passed as validation data.
model.fit(train_xs, train_ys,
                 batch_size=1024,
                 epochs=30,
                 verbose=1,
                 validation_data=(dev_xs, dev_ys))

# If training is too much trouble, use this instead (loads pre-trained parameters)
# model.load_weights('models/weights.h5')

Train on 929552 samples, validate on 45422 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
 80896/929552 [=>............................] - ETA: 1:13 - loss: 0.3285 - acc: 0.8936

KeyboardInterrupt: 

In [None]:
# Running the trained model on a few inputs gives back some kind of matrix
model.predict(test_xs[:10])

In [None]:
# Shape of that matrix. Computed directly instead of through Out[32]: the
# index into IPython's output cache depends on how many cells were run before,
# so a hard-coded number breaks on any fresh run of the notebook.
model.predict(test_xs[:10]).shape

In [None]:
# Reverse lookup: from tag ID back to the POS tag string
rev_tag_dict = dict(zip(tag_dict.values(), tag_dict.keys()))

In [None]:
# Highest-scoring tag for each of the first ten test windows. The prediction
# is recomputed here rather than read from Out[32], whose index depends on
# the execution count of an earlier cell.
[rev_tag_dict[i] for i in np.argmax(model.predict(test_xs[:10]), 1)]

In [None]:
# Function that takes a list of words and predicts their POS tags
def predict(words):
    """Predict one POS tag for every word of a tokenised sentence.

    Args:
        words: List of word strings. PAD tokens at either end are ignored, so
            a sentence that is already (partly) padded may be passed as well.

    Returns:
        A list of tag strings, one per word that remains after the PAD tokens
        at the edges are removed; an empty list if nothing remains.
    """
    # Trim PAD tokens the caller may have left at the edges (the sentences
    # stored in test_sents are padded), so that padding is applied exactly once.
    start, end = 0, len(words)
    while start < end and words[start] == PAD:
        start += 1
    while end > start and words[end - 1] == PAD:
        end -= 1
    # The vocabulary was built from lower-cased words, so lower-case here too;
    # otherwise capitalised input would always fall back to UNK.
    core = [word.lower() for word in words[start:end]]
    if not core:
        return []

    # Windows are 7 tokens wide, so every word needs three tokens of context
    # on both sides. Padding with only two yields two tags fewer than words.
    padded = [PAD] * 3 + core + [PAD] * 3
    unk_id = word_dict[UNK]
    ids = [word_dict.get(word, unk_id) for word in padded]
    windows = sliding_windows(ids)
    matrix = np.array(windows, 'i')
    probabilities = model.predict(matrix)
    result_ids = np.argmax(probabilities, 1)
    return [rev_tag_dict[i] for i in result_ids]

In [None]:
# Try the tagger on a hand-written sentence.
predict(['this', 'is', 'a', 'test', 'sentence', '.'])

In [None]:
import random

# Show predictions next to the gold tags for five random test sentences.
for _ in range(5):
    # randrange excludes its upper bound. randint(0, len(test_sents)) can
    # return len(test_sents) itself and raise IndexError on the next line.
    i = random.randrange(len(test_sents))
    words, tags = test_sents[i]
    # Sentences in test_sents carry three PAD tokens per side. predict() is
    # given the same slice as before (one PAD left on each side); only the
    # printed sentence drops that last PAD so it shows the real words.
    words = words[2:-2]
    print('sentence:', words[1:-1])
    print('predict:', predict(words))
    print('answer:', tags)

In [None]:
# Save the model's weights to disk.
model.save_weights('models/weights.h5')