In [1]:
import numpy as np, keras
#from mr import mr
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Dropout, Bidirectional, TimeDistributed, Masking
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
set_session(tf.Session(config=config))

In [3]:
embedding_dim = 200

In [4]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('unlabeled-vector.bin', binary=True, unicode_errors='ignore') 
print('Found %s word vectors.' % len(model.index2word))

Found 1466168 word vectors.


In [5]:
X_train = np.load('train_x.npy'); y_train = np.load('train_y.npy'); l_train = np.load('train_l.npy')
X_valid = np.load('valid_x.npy'); y_valid = np.load('valid_y.npy'); l_valid = np.load('valid_l.npy')
X_test  = np.load('test_x.npy' ); y_test  = np.load('test_y.npy' ); l_test  = np.load('test_l.npy' )

In [6]:
vocab_size = np.max(list(set.union(*[set(x) for x in X_train]))) + 1

In [7]:
max_length = len(X_train[0])

In [8]:
dictionary_fwd = np.load('dictionary_fwd.npy').item()

In [9]:
embedding_matrix = np.zeros((vocab_size, embedding_dim))
found = 0

for word, i in dictionary_fwd.iteritems():
    if word in model:
        if i >= vocab_size:
            continue
        
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = model[word]
        found += 1

print found, 'found of', len(dictionary_fwd)

8852 found of 10098


In [10]:
# create the model
model = Sequential()
model.add(Masking(mask_value=0, input_shape=(127,)))
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, \
                    weights=[embedding_matrix], trainable=True))
model.add(Bidirectional(GRU(150, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(GRU(150, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(GRU(150, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(TimeDistributed(Dense(3, activation='softmax')))

In [11]:
print model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (None, 127)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 127, 200)          2224000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 127, 300)          315900    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 127, 300)          405900    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 127, 300)          405900    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 127, 3)            903       
Total params: 3,352,603
Trainable params: 3,352,603
Non-trainable params: 0
_________________________________________________________________


In [12]:
from keras.callbacks import ModelCheckpoint

In [13]:
mcp = ModelCheckpoint('bidir_rnn_mesh.hdf5', monitor="val_acc", save_best_only=True, save_weights_only=False)

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [15]:
y_train2 = np.array([to_categorical(y, 3) for y in y_train])
y_valid2 = np.array([to_categorical(y, 3) for y in y_valid])
y_test2 = np.array([to_categorical(y, 3) for y in y_test])

In [16]:
model.fit(X_train, y_train2, epochs=30, batch_size=128, validation_data=(X_valid, y_valid2), callbacks=[mcp])

Train on 2558 samples, validate on 365 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f2e7f7eeed0>

In [17]:
model.load_weights('bidir_rnn_mesh.hdf5')

In [18]:
result = model.evaluate(X_test, y_test2, batch_size=128)



In [19]:
predictions = model.predict(X_test)

In [20]:
with open('predictions_rnn_mesh.txt', 'w') as f:
    for tokens, prediction in zip(X_test, predictions):
        started = False

        for tok, pred in zip(tokens, prediction):
            if not started and tok == 1:
                started = True
            elif started and tok == 2:
                f.write('\n')
                break
            elif started:
                if np.argmax(pred) == 0:
                    f.write('O\n')
                elif np.argmax(pred) == 1:
                    f.write('T\n')
                elif np.argmax(pred) == 2:
                    f.write('D\n')