### Описание задания:
Применение рекуррентных нейросетей для решения проблем автоматической морфологической разметки(Part Of Speech Tagging)

### Задачи:
1) Создать RNN нейросеть
2) Создать двунаправленную RNN нейросеть
3) Создать RNN нейросеть с использованием CRF

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
import os
import sys
import wget
import zipfile

import numpy as np
import tensorflow as tf

from collections import Counter
from collections import defaultdict

from IPython.display import HTML, display

import keras
from keras import layers
from keras.utils.np_utils import to_categorical

from sklearn.model_selection import train_test_split

In [3]:
tf.version.VERSION

'2.10.0'

### Получение данных

In [4]:
nltk.download('brown')
nltk.download('universal_tagset')
data = nltk.corpus.brown.tagged_sents(tagset='universal')
all_tags = ['#EOS#','#UNK#','ADV', 'NOUN', 'ADP', 'PRON', 'DET', '.', 'PRT', 'VERB', 'X', 'NUM', 'CONJ', 'ADJ']

data = np.array([ [(word.lower(),tag) for word,tag in sentence] for sentence in data ])

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\CaBa\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\CaBa\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [5]:
data.shape

(57340,)

In [6]:
data[:2][:]

array([list([('the', 'DET'), ('fulton', 'NOUN'), ('county', 'NOUN'), ('grand', 'ADJ'), ('jury', 'NOUN'), ('said', 'VERB'), ('friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')]),
       list([('the', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('city', 'NOUN'), ('executive', 'ADJ'), ('committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP')

In [7]:
def draw(sentence):
    words,tags = zip(*sentence)
    display(HTML('<table><tr>{tags}</tr>{words}<tr></table>'.format(
                words = '<td>{}</td>'.format('</td><td>'.join(words)),
                tags = '<td>{}</td>'.format('</td><td>'.join(tags)))))
    
    
draw(data[11])
draw(data[10])
draw(data[7])

0,1,2,3,4,5,6,7,8,9,10,11,12,13
NOUN,ADP,NOUN,NOUN,NOUN,NOUN,VERB,ADV,VERB,ADP,DET,ADJ,NOUN,.
,,,,,,,,,,,,,


0,1,2,3,4,5,6,7,8,9,10,11,12,13
PRON,VERB,ADP,DET,NOUN,.,VERB,NOUN,PRT,VERB,.,DET,NOUN,.
,,,,,,,,,,,,,


0,1
NOUN,VERB
,


### Создание словарей для words и tags

In [8]:
word_counts = Counter()
for sentence in data:
    words,tags = zip(*sentence)
    word_counts.update(words)

all_words = ['#EOS#','#UNK#'] + list(list(zip(*word_counts.most_common(10000)))[0])

print("Coverage = %.5f" % (float(sum(word_counts[w] for w in all_words)) / sum(word_counts.values())))

Coverage = 0.92876


In [9]:
word_to_id = defaultdict(lambda: 1, { word: ind for ind, word in enumerate(all_words) })
tag_to_id = { tag: ind for ind, tag in enumerate(all_tags)}

Преобразование words и tags в матрицу фиксированного размера

In [10]:
def to_matrix(lines, token_to_id, max_len=None, pad=0, dtype='int32', time_major=False):
    """Converts a list of names into rnn-digestable matrix with paddings added after the end"""
    
    max_len = max_len or max(map(len,lines))
    matrix = np.empty([len(lines), max_len],dtype)
    matrix.fill(pad)

    for i in range(len(lines)):
        line_ix = list(map(token_to_id.__getitem__,lines[i]))[:max_len]
        matrix[i,:len(line_ix)] = line_ix

    return matrix.T if time_major else matrix

In [11]:
batch_words, batch_tags = zip(*[zip(*sentence) for sentence in data[-3:]])

print("Word ids:")
print(to_matrix(batch_words, word_to_id))
print("Tag ids:")
print(to_matrix(batch_tags, tag_to_id))

Word ids:
[[   2 3057    5    2 2238 1334 4238 2454    3    6   19   26 1070   69
     8 2088    6    3    1    3  266   65  342    2    1    3    2  315
     1    9   87  216 3322   69 1558    4    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [  45   12    8  511 8419    6   60 3246   39    2    1    1    3    2
   845    1    3    1    3   10 9910    2    1 3470    9   43    1    1
     3    6    2 1046  385   73 4562    3    9    2    1    1 3250    3
    12   10    2  861 5240   12    8 8936  121    1    4]
 [  33   64   26   12  445    7 7346    9    8 3337    3    1 2811    3
     2  463  572    2    1    1 1649   12    1    4    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]
Tag ids:
[[ 6  3  4  6  3  3  9  9  7 12  4  5  9  4  6  3 12  7  9  7  9  8  4  6
   3  7  6 13  3  4  6  3  9  4  3  7  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0

### Создание RNN нейросети

In [12]:
train_data, test_data = train_test_split(data, test_size=0.25, random_state=21)

In [13]:
print("Train size = ", len(train_data))
print("Test size = ", len(test_data))

Train size =  43005
Test size =  14335


In [14]:
model = keras.models.Sequential()
model.add(layers.InputLayer([None], dtype='int32'))
model.add(layers.Embedding(len(all_words), 50))
model.add(layers.SimpleRNN(64, return_sequences=True))

#add top layer that predicts tag probabilities
stepwise_dense = layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = layers.TimeDistributed(stepwise_dense)
model.add(stepwise_dense)

Создание генератора, который возвращает по одному batch за раз

In [15]:
BATCH_SIZE=32
def generate_batches(sentences, batch_size=BATCH_SIZE, max_len=None, pad=0):
    assert isinstance(sentences, np.ndarray),"Make sure sentences is q numpy array"
    
    while True:
        indices = np.random.permutation(np.arange(len(sentences)))
        for start in range(0, len(indices) - 1, batch_size):
            batch_indices = indices[start:start + batch_size]
            batch_words, batch_tags = [], []
            for sent in sentences[batch_indices]:
                words,tags = zip(*sent)
                batch_words.append(words)
                batch_tags.append(tags)

            batch_words = to_matrix(batch_words, word_to_id, max_len,pad)
            batch_tags = to_matrix(batch_tags, tag_to_id, max_len,pad)

            batch_tags_1hot = to_categorical(batch_tags, len(all_tags)).reshape(batch_tags.shape + (-1,))
            yield batch_words, batch_tags_1hot

Создание метода для измерение производительности модели.

In [16]:
def compute_test_accuracy(model):
    test_words, test_tags = zip(*[zip(*sentence) for sentence in test_data])
    test_words, test_tags = to_matrix(test_words, word_to_id),to_matrix(test_tags, tag_to_id)

    #predict tag probabilities of shape [batch,time,n_tags]
    predicted_tag_probabilities = model.predict(test_words, verbose=1)
    predicted_tags = predicted_tag_probabilities.argmax(axis=-1)

    #compute accurary excluding padding
    numerator = np.sum(np.logical_and((predicted_tags == test_tags), (test_words != 0)))
    denominator = np.sum(test_words != 0)
    return float(numerator) / denominator


class EvaluateAccuracy(keras.callbacks.Callback):
    def on_epoch_end(self,epoch, logs=None):
        sys.stdout.flush()
        print("\nMeasuring validation accuracy...")
        acc = compute_test_accuracy(self.model)
        print("\nValidation accuracy: %.5f\n"%acc)
        sys.stdout.flush()

In [17]:
model.compile('adam','categorical_crossentropy')

model.fit_generator(generate_batches(train_data), len(train_data) / BATCH_SIZE, callbacks=[EvaluateAccuracy()], epochs=5)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.93987

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.94431

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.94670

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.94631

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.94475



<keras.callbacks.History at 0x1fe4b2467a0>

In [18]:
acc = compute_test_accuracy(model)
print("Final accuracy: %.5f"%acc)

assert acc > 0.94, "Keras has gone on a rampage again, please contact course staff."

Final accuracy: 0.94475


### Создание двунаправленной RNN нейросети.

In [22]:
bidirect_model = keras.models.Sequential()
bidirect_model.add(layers.InputLayer([None], dtype='int32'))
bidirect_model.add(layers.Embedding(len(all_words), 50))
bidirect_model.add(layers.Bidirectional(layers.SimpleRNN(64, return_sequences=True)))
bidirect_model.add(layers.Bidirectional(layers.LSTM(24, return_sequences=True)))

stepwise_dense = layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = layers.TimeDistributed(stepwise_dense)
bidirect_model.add(stepwise_dense)

In [23]:
bidirect_model.compile('adam', 'categorical_crossentropy')

bidirect_model.fit_generator(generate_batches(train_data),len(train_data) / BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95681

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.96125

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96351

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96380

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96261



<keras.callbacks.History at 0x1fe7d8e6380>

In [24]:
acc = compute_test_accuracy(bidirect_model)
print("\nFinal accuracy: %.5f"%acc)

assert acc > 0.96, "Bidirectional RNNs are better than this!"
print("Well done!")


Final accuracy: 0.96261
Well done!


### Создание двунаправленной RNN нейросети c использованием Conditional Random Fields

CRF входит в библиотку keras_contrib. Установка: !pip install git+https://www.github.com/keras-team/keras-contrib.git

In [25]:
from keras_contrib.layers import CRF

In [28]:
crf_model = keras.models.Sequential()
crf_model.add(layers.InputLayer([None], dtype='int32'))
crf_model.add(layers.Embedding(len(all_words), 50))
crf_model.add(layers.Bidirectional(layers.SimpleRNN(64, return_sequences=True)))
crf_model.add(layers.Bidirectional(layers.LSTM(24, return_sequences=True)))

stepwise_dense = CRF(len(all_tags), sparse_target=True)
stepwise_dense = layers.Dense(len(all_tags), activation='softmax')
stepwise_dense = layers.TimeDistributed(stepwise_dense)
crf_model.add(stepwise_dense)

In [29]:
crf_model.compile('adam', 'categorical_crossentropy')

crf_model.fit_generator(generate_batches(train_data), len(train_data) / BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=5,)

Epoch 1/5
Measuring validation accuracy...

Validation accuracy: 0.95495

Epoch 2/5
Measuring validation accuracy...

Validation accuracy: 0.96133

Epoch 3/5
Measuring validation accuracy...

Validation accuracy: 0.96257

Epoch 4/5
Measuring validation accuracy...

Validation accuracy: 0.96340

Epoch 5/5
Measuring validation accuracy...

Validation accuracy: 0.96194



<keras.callbacks.History at 0x1fe7ff49300>

In [48]:
acc = compute_test_accuracy(crf_model)
print("\nCRF accuracy: %.5f"%acc)


CRF accuracy: 0.96194


### Использование предобученных GloVe embeddings

**Внимание:** Размер предобученного GloVe embeddings - 822M

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip --no-check-certificate

In [36]:
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_data:
    zip_data.extractall()

In [42]:
path_to_glove_file = 'glove.6B.100d.txt'

embeddings_index = {}
with open(path_to_glove_file, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [43]:
num_tokens = len(all_words) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_to_id.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 40481 words (9336 misses)


In [51]:
glove_model = keras.models.Sequential()
glove_model.add(layers.InputLayer([None], dtype='int32'))
glove_model.add(layers.Embedding(num_tokens,
                                 embedding_dim,
                                 embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                 trainable=False,))
glove_model.add(layers.Bidirectional(layers.SimpleRNN(64, return_sequences=True)))
glove_model.add(layers.Bidirectional(layers.LSTM(24, return_sequences=True)))

glove_stepwise_dense = CRF(len(all_tags), sparse_target=True)
glove_stepwise_dense = layers.Dense(len(all_tags), activation='softmax')
glove_stepwise_dense = layers.TimeDistributed(glove_stepwise_dense)
glove_model.add(glove_stepwise_dense)

In [52]:
glove_model.compile('adam', 'categorical_crossentropy')

glove_model.fit_generator(generate_batches(train_data), len(train_data) / BATCH_SIZE,
                    callbacks=[EvaluateAccuracy()], epochs=7,)

Epoch 1/7
Measuring validation accuracy...

Validation accuracy: 0.93344

Epoch 2/7
Measuring validation accuracy...

Validation accuracy: 0.94736

Epoch 3/7
Measuring validation accuracy...

Validation accuracy: 0.95350

Epoch 4/7
Measuring validation accuracy...

Validation accuracy: 0.95620

Epoch 5/7
Measuring validation accuracy...

Validation accuracy: 0.95910

Epoch 6/7
Measuring validation accuracy...

Validation accuracy: 0.96033

Epoch 7/7
Measuring validation accuracy...

Validation accuracy: 0.96150



<keras.callbacks.History at 0x1feaae9d720>

In [54]:
acc = compute_test_accuracy(glove_model)
print("\nRNN with GloVe accuracy: %.5f"%acc)


RNN with GloVe accuracy: 0.96150


### Выводы

В рамках проекта были реализованы различные схемы рекуррентных нейросетей:
- однонаправленная RNN нейросеть
- двунаправленная RNN нейросеть
- двунаправленная RNN нейросеть c использованием Conditional Random Fields

Также для нейросети был применён предобученный GloVe embeddings

### Для проверяющего

Предполагалось, что применение доп. методов улучшит Validation accuracy. Но нейросети с применением CRF и предобученного GloVe embeddings показали такую же Validation accuracy как и обычная двунаправленная RNN нейросеть. 

Подскажите, с чем это связано, и как можно улучшить работу сети?